diff --git a/.config/hakari.toml b/.config/hakari.toml index 3b6d9d8822..9991cd92b0 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -33,6 +33,7 @@ workspace-members = [ "compute_api", "consumption_metrics", "desim", + "json", "metrics", "pageserver_api", "postgres_backend", diff --git a/.github/actionlint.yml b/.github/actionlint.yml index b7e0be761a..3142a36fa0 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -7,6 +7,7 @@ self-hosted-runner: - small-metal - small-arm64 - unit-perf + - unit-perf-aws-arm - us-east-2 config-variables: - AWS_ECR_REGION diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 7b2c9c2ce3..2296807d2d 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -32,162 +32,14 @@ permissions: contents: read jobs: - build-pgxn: - if: | - inputs.pg_versions != '[]' || inputs.rebuild_everything || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - timeout-minutes: 30 - runs-on: macos-15 - strategy: - matrix: - postgres-version: ${{ inputs.rebuild_everything && fromJSON('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - name: Checkout main repo - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Set pg ${{ matrix.postgres-version }} for caching - id: pg_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}" - - - name: Cache postgres ${{ matrix.postgres-version }} build - id: cache_pg - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: pg_install/${{ matrix.postgres-version }} - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }} - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - git submodule init vendor/postgres-${{ matrix.postgres-version }} - git submodule update --depth 1 --recursive - - - name: Install build dependencies - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - brew install flex bison openssl protobuf icu4c - - - name: Set extra env for macOS - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - - name: Build Postgres ${{ matrix.postgres-version }} - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) - - - name: Build Neon Pg Ext ${{ matrix.postgres-version }} - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu) - - - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: pg_install--${{ matrix.postgres-version }} - path: pg_install/${{ 
matrix.postgres-version }} - # The artifact is supposed to be used by the next job in the same workflow, - # so there’s no need to store it for too long. - retention-days: 1 - - build-walproposer-lib: - if: | - contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || - contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || - github.ref_name == 'main' - timeout-minutes: 30 - runs-on: macos-15 - needs: [build-pgxn] - env: - # Use release build only, to have less debug info around - # Hence keeping target/ (and general cache size) smaller - BUILD_TYPE: release - steps: - - name: Harden the runner (Audit all outbound calls) - uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 - with: - egress-policy: audit - - - name: Checkout main repo - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: Set pg v17 for caching - id: pg_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}" - - - name: Download "pg_install/v17" artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: pg_install--v17 - path: pg_install/v17 - - # `actions/download-artifact` doesn't preserve permissions: - # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss - - name: Make pg_install/v*/bin/* executable - run: | - chmod +x pg_install/v*/bin/* - - - name: Cache walproposer-lib - id: cache_walproposer_lib - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 - with: - path: build/walproposer-lib - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Checkout submodule vendor/postgres-v17 - if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' - run: | - git submodule init vendor/postgres-v17 - git submodule update --depth 1 --recursive - - - name: Install build dependencies - if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' - run: | - brew install flex bison openssl protobuf icu4c - - - name: Set extra env for macOS - if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' - run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - - name: Build walproposer-lib (only for v17) - if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' - run: - make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1 - - - name: Upload "build/walproposer-lib" artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - with: - name: build--walproposer-lib - path: build/walproposer-lib - # The artifact is supposed to be used by the next job in the same workflow, - # so there’s no need to store it for too long. 
- retention-days: 1 - - cargo-build: + make-all: if: | inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - timeout-minutes: 30 + timeout-minutes: 60 runs-on: macos-15 - needs: [build-pgxn, build-walproposer-lib] env: # Use release build only, to have less debug info around # Hence keeping target/ (and general cache size) smaller @@ -203,41 +55,53 @@ jobs: with: submodules: true - - name: Download "pg_install/v14" artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: pg_install--v14 - path: pg_install/v14 - - - name: Download "pg_install/v15" artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: pg_install--v15 - path: pg_install/v15 - - - name: Download "pg_install/v16" artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: pg_install--v16 - path: pg_install/v16 - - - name: Download "pg_install/v17" artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: pg_install--v17 - path: pg_install/v17 - - - name: Download "build/walproposer-lib" artifact - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 - with: - name: build--walproposer-lib - path: build/walproposer-lib - - # `actions/download-artifact` doesn't preserve permissions: - # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss - - name: Make pg_install/v*/bin/* executable + - name: Install build dependencies run: | - chmod +x pg_install/v*/bin/* + brew install flex bison openssl protobuf icu4c + + - name: Set extra env for macOS + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Restore "pg_install/" cache + id: cache_pg + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: pg_install + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-install-v14-${{ hashFiles('Makefile', 'postgres.mk', 'vendor/revisions.json') }} + + - name: Checkout vendor/postgres submodules + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + git submodule init + git submodule update --depth 1 --recursive + + - name: Build Postgres + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + make postgres -j$(sysctl -n hw.ncpu) + + # This isn't strictly necessary, but it makes the cached and non-cached builds more similar, + # When pg_install is restored from cache, there is no 'build/' directory. By removing it + # in a non-cached build too, we enforce that the rest of the steps don't depend on it, + # so that we notice any build caching bugs earlier. + - name: Remove build artifacts + if: steps.cache_pg.outputs.cache-hit != 'true' + run: | + rm -rf build + + # Explicitly update the rust toolchain before running 'make'. The parallel make build can + # invoke 'cargo build' more than once in parallel, for different crates. That's OK, 'cargo' + # does its own locking to prevent concurrent builds from stepping on each other's + # toes. However, it will first try to update the toolchain, and that step is not locked the + # same way. 
To avoid two toolchain updates running in parallel and stepping on each other's + # toes, ensure that the toolchain is up-to-date beforehand. + - name: Update rust toolchain + run: | + rustup --version && + rustup update && + rustup show - name: Cache cargo deps uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 @@ -249,17 +113,12 @@ jobs: target key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - name: Install build dependencies - run: | - brew install flex bison openssl protobuf icu4c - - - name: Set extra env for macOS - run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - - name: Run cargo build - run: cargo build --all --release -j$(sysctl -n hw.ncpu) + # Build the neon-specific postgres extensions, and all the Rust bits. + # + # Pass PG_INSTALL_CACHED=1 because PostgreSQL was already built and cached + # separately. + - name: Build all + run: PG_INSTALL_CACHED=1 BUILD_TYPE=release make -j$(sysctl -n hw.ncpu) all - name: Check that no warnings are produced run: ./run_clippy.sh diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 456c7b8c92..864abad574 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -306,14 +306,14 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, unit-perf ] + runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file - options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined strategy: fail-fast: false matrix: @@ -986,6 +986,7 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 60 env: + PARALLEL_COMPUTES: 3 TAG: >- ${{ needs.meta.outputs.run-kind == 'compute-rc-pr' diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 317db94052..728a6d4956 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -1,4 +1,4 @@ -name: Periodic pagebench performance test on unit-perf hetzner runner +name: Periodic pagebench performance test on unit-perf-aws-arm runners on: schedule: @@ -40,7 +40,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [ self-hosted, unit-perf ] + runs-on: [ self-hosted, unit-perf-aws-arm ] container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml index 3a98ad4e8e..0ae93ce295 100644 --- a/.github/workflows/proxy-benchmark.yml +++ b/.github/workflows/proxy-benchmark.yml @@ -1,4 +1,4 @@ -name: Periodic proxy performance test on unit-perf hetzner runner +name: Periodic proxy performance test on unit-perf-aws-arm runners on: push: # TODO: remove after testing @@ -32,7 +32,7 @@ jobs: statuses: write contents: write pull-requests: write - runs-on: [self-hosted, unit-perf] + runs-on: [self-hosted, unit-perf-aws-arm] timeout-minutes: 60 # 1h timeout container: image: ghcr.io/neondatabase/build-tools:pinned-bookworm diff --git a/.gitignore 
b/.gitignore index 6574d7b9de..4857972f1d 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ neon.iml /.neon /integration_tests/.neon compaction-suite-results.* +docker-compose/docker-compose-parallel.yml # Coverage *.profraw diff --git a/Cargo.lock b/Cargo.lock index 4c9cfa97e1..893932fb9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1083,6 +1083,25 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cbindgen" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684" +dependencies = [ + "clap", + "heck", + "indexmap 2.9.0", + "log", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn 2.0.100", + "tempfile", + "toml", +] + [[package]] name = "cc" version = "1.2.16" @@ -1267,6 +1286,15 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "communicator" +version = "0.1.0" +dependencies = [ + "cbindgen", + "neon-shmem", + "workspace_hack", +] + [[package]] name = "compute_api" version = "0.1.0" @@ -3461,6 +3489,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json" +version = "0.1.0" +dependencies = [ + "futures", + "itoa", + "ryu", +] + [[package]] name = "json-structural-diff" version = "0.2.0" @@ -4302,6 +4339,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "base64 0.22.1", "bincode", "bit_field", "byteorder", @@ -4455,6 +4493,21 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client_grpc" +version = "0.1.0" +dependencies = [ + "anyhow", + "futures", + "pageserver_page_api", + "tokio", + "tokio-stream", + "tonic 0.13.1", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pageserver_compaction" version = "0.1.0" @@ -5647,6 +5700,8 @@ dependencies = [ "azure_identity", "azure_storage", "azure_storage_blobs", + "base64 0.22.1", + "byteorder", "bytes", "camino", "camino-tempfile", @@ -8665,8 +8720,10 @@ dependencies = [ "fail", "form_urlencoded", "futures-channel", + "futures-core", "futures-executor", "futures-io", + "futures-sink", "futures-util", "generic-array", "getrandom 0.2.11", @@ -8693,6 +8750,7 @@ dependencies = [ "num-iter", "num-rational", "num-traits", + "once_cell", "p256 0.13.2", "parquet", "prettyplease", diff --git a/Cargo.toml b/Cargo.toml index 7728f6d8fe..14f2cfcb56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "pageserver/compaction", "pageserver/ctl", "pageserver/client", + "pageserver/client_grpc", "pageserver/pagebench", "pageserver/page_api", "proxy", @@ -42,10 +43,12 @@ members = [ "libs/walproposer", "libs/wal_decoder", "libs/postgres_initdb", + "libs/proxy/json", "libs/proxy/postgres-protocol2", "libs/proxy/postgres-types2", "libs/proxy/tokio-postgres2", "endpoint_storage", + "pgxn/neon/communicator", ] [workspace.package] @@ -255,6 +258,7 @@ desim = { version = "0.1", path = "./libs/desim" } endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" } http-utils = { version = "0.1", path = "./libs/http-utils/" } metrics = { version = "0.1", path = "./libs/metrics/" } +neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" } pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } @@ -284,6 +288,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" } workspace_hack = { 
version = "0.1", path = "./workspace_hack/" } ## Build dependencies +cbindgen = "0.29.0" criterion = "0.5.1" rcgen = "0.13" rstest = "0.18" diff --git a/Dockerfile b/Dockerfile index d518370ab8..55b87d4012 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,18 @@ ARG BASE_IMAGE_SHA=debian:${DEBIAN_FLAVOR} ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bookworm-slim/debian@$BOOKWORM_SLIM_SHA} ARG BASE_IMAGE_SHA=${BASE_IMAGE_SHA/debian:bullseye-slim/debian@$BULLSEYE_SLIM_SHA} -# Build Postgres +# Naive way: +# +# 1. COPY . . +# 2. make neon-pg-ext +# 3. cargo build +# +# But to enable docker to cache intermediate layers, we perform a few preparatory steps: +# +# - Build all postgres versions, depending on just the contents of vendor/ +# - Use cargo chef to build all rust dependencies + +# 1. Build all postgres versions FROM $REPOSITORY/$IMAGE:$TAG AS pg-build WORKDIR /home/nonroot @@ -38,17 +49,15 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 -COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot postgres.mk postgres.mk COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . + && mold -run make -j $(nproc) -s postgres -# Prepare cargo-chef recipe +# 2. Prepare cargo-chef recipe FROM $REPOSITORY/$IMAGE:$TAG AS plan WORKDIR /home/nonroot @@ -56,23 +65,22 @@ COPY --chown=nonroot . . RUN cargo chef prepare --recipe-path recipe.json -# Build neon binaries +# Main build image FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG - -COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server -COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server -COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server -COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server -COPY --from=plan /home/nonroot/recipe.json recipe.json - ARG ADDITIONAL_RUSTFLAGS="" +# 3. Build cargo dependencies. Note that this step doesn't depend on anything other than +# `recipe.json`, so the layer can be reused as long as none of the dependencies change. +COPY --from=plan /home/nonroot/recipe.json recipe.json RUN set -e \ && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json +# Perform the main build. We reuse the Postgres build artifacts from the intermediate 'pg-build' +# layer, and the cargo dependencies built in the previous step. COPY --chown=nonroot --from=pg-build /home/nonroot/pg_install/ pg_install COPY --chown=nonroot . . 
RUN set -e \ @@ -87,10 +95,10 @@ RUN set -e \ --bin endpoint_storage \ --bin neon_local \ --bin storage_scrubber \ - --locked --release + --locked --release \ + && mold -run make -j $(nproc) -s neon-pg-ext -# Build final image -# +# Assemble the final image FROM $BASE_IMAGE_SHA WORKDIR /data @@ -130,12 +138,15 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin +COPY --from=build /home/nonroot/pg_install/v14 /usr/local/v14/ +COPY --from=build /home/nonroot/pg_install/v15 /usr/local/v15/ +COPY --from=build /home/nonroot/pg_install/v16 /usr/local/v16/ +COPY --from=build /home/nonroot/pg_install/v17 /usr/local/v17/ -COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ -COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ -COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ -COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/ -COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ +# Deprecated: Old deployment scripts use this tarball which contains all the Postgres binaries. +# That's obsolete, since all the same files are also present under /usr/local/v*. But to keep the +# old scripts working for now, create the tarball. +RUN tar -C /usr/local -cvzf /data/postgres_install.tar.gz v14 v15 v16 v17 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. # Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. diff --git a/Makefile b/Makefile index 7f8f436a2e..4b31e26810 100644 --- a/Makefile +++ b/Makefile @@ -30,11 +30,18 @@ ifeq ($(BUILD_TYPE),release) PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) CARGO_PROFILE ?= --profile=release + # NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places + # the final build artifacts. There is unfortunately no easy way of changing + # it to a fully predictable path, nor to extract the path with a simple + # command. See https://github.com/rust-lang/cargo/issues/9661 and + # https://github.com/rust-lang/cargo/issues/6790. + NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) CARGO_PROFILE ?= --profile=dev + NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif @@ -102,7 +109,7 @@ all: neon postgres-install neon-pg-ext ### Neon Rust bits # -# The 'postgres_ffi' depends on the Postgres headers. +# The 'postgres_ffi' crate depends on the Postgres headers. 
.PHONY: neon neon: postgres-headers-install walproposer-lib cargo-target-dir +@echo "Compiling Neon" @@ -115,10 +122,13 @@ cargo-target-dir: test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG .PHONY: neon-pg-ext-% -neon-pg-ext-%: postgres-install-% +neon-pg-ext-%: postgres-install-% cargo-target-dir +@echo "Compiling neon-specific Postgres extensions for $*" mkdir -p $(BUILD_DIR)/pgxn-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ + $(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \ + NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \ + CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \ + CARGO_PROFILE="$(CARGO_PROFILE)" \ -C $(BUILD_DIR)/pgxn-$*\ -f $(ROOT_PROJECT_DIR)/pgxn/Makefile install diff --git a/clippy.toml b/clippy.toml index 408232488c..c03059053a 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,9 +1,12 @@ disallowed-methods = [ "tokio::task::block_in_place", + # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", - # use tokio_epoll_uring_ext instead - "tokio_epoll_uring::thread_local_system", + + # tokio-epoll-uring: + # - allow-invalid because the method doesn't exist on macOS + { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true } ] disallowed-macros = [ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 111e64d5d1..39136fe573 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1572,6 +1572,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pgaudit-src ARG PG_VERSION WORKDIR /ext-src +COPY "compute/patches/pgaudit-parallel_workers-${PG_VERSION}.patch" . RUN case "${PG_VERSION}" in \ "v14") \ export PGAUDIT_VERSION=1.6.3 \ @@ -1594,7 +1595,8 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ - mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . && \ + patch -p1 < "/ext-src/pgaudit-parallel_workers-${PG_VERSION}.patch" FROM pg-build AS pgaudit-build COPY --from=pgaudit-src /ext-src/ /ext-src/ @@ -1634,11 +1636,14 @@ RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN) # compile neon extensions # ######################################################################################### -FROM pg-build AS neon-ext-build +FROM pg-build-with-cargo AS neon-ext-build ARG PG_VERSION -COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute +USER root +COPY . . 
+ +RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute \ + BUILD_TYPE=release CARGO_BUILD_FLAGS="--locked --release" NEON_CARGO_ARTIFACT_TARGET_DIR="$(pwd)/target/release" ######################################################################################### # @@ -1910,10 +1915,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig -RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \ +RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \ && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/* ENV PATH=/usr/local/pgsql/bin:$PATH -ENV PGHOST=compute +ENV PGHOST=compute1 ENV PGPORT=55433 ENV PGUSER=cloud_admin ENV PGDATABASE=postgres diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index e64d907fe4..b712631d71 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -8,6 +8,8 @@ import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', import 'sql_exporter/compute_max_connections.libsonnet', + import 'sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet', + import 'sql_exporter/compute_pg_oldest_mxid_age.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', import 'sql_exporter/compute_subscriptions_count.libsonnet', import 'sql_exporter/connection_counts.libsonnet', diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet new file mode 100644 index 0000000000..03d5cf860f --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'compute_pg_oldest_frozen_xid_age', + type: 'gauge', + help: 'Age of oldest XIDs that have not been frozen by VACUUM. An indicator of how long it has been since VACUUM last ran.', + key_labels: [ + 'database_name', + ], + value_label: 'metric', + values: [ + 'frozen_xid_age', + ], + query: importstr 'sql_exporter/compute_pg_oldest_frozen_xid_age.sql', +} diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql new file mode 100644 index 0000000000..d2281fdd42 --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql @@ -0,0 +1,4 @@ +SELECT datname database_name, + age(datfrozenxid) frozen_xid_age +FROM pg_database +ORDER BY frozen_xid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet new file mode 100644 index 0000000000..12063a0f71 --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'compute_pg_oldest_mxid_age', + type: 'gauge', + help: 'Age of oldest MXIDs that have not been replaced by VACUUM. 
An indicator of how long it has been since VACUUM last ran.', + key_labels: [ + 'database_name', + ], + value_label: 'metric', + values: [ + 'min_mxid_age', + ], + query: importstr 'sql_exporter/compute_pg_oldest_mxid_age.sql', +} diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql new file mode 100644 index 0000000000..ed57894b3a --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql @@ -0,0 +1,4 @@ +SELECT datname database_name, + mxid_age(datminmxid) min_mxid_age +FROM pg_database +ORDER BY min_mxid_age DESC LIMIT 10; diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch index e833a6dfd3..4faf927e39 100644 --- a/compute/patches/anon_v2.patch +++ b/compute/patches/anon_v2.patch @@ -1,8 +1,8 @@ diff --git a/sql/anon.sql b/sql/anon.sql -index 0cdc769..f6cc950 100644 +index 0cdc769..b450327 100644 --- a/sql/anon.sql +++ b/sql/anon.sql -@@ -1141,3 +1141,8 @@ $$ +@@ -1141,3 +1141,15 @@ $$ -- TODO : https://en.wikipedia.org/wiki/L-diversity -- TODO : https://en.wikipedia.org/wiki/T-closeness @@ -11,6 +11,13 @@ index 0cdc769..f6cc950 100644 + +GRANT ALL ON SCHEMA anon to neon_superuser; +GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; ++ ++DO $$ ++BEGIN ++ IF current_setting('server_version_num')::int >= 150000 THEN ++ GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser; ++ END IF; ++END $$; diff --git a/sql/init.sql b/sql/init.sql index 7da6553..9b6164b 100644 --- a/sql/init.sql diff --git a/compute/patches/pgaudit-parallel_workers-v14.patch b/compute/patches/pgaudit-parallel_workers-v14.patch new file mode 100644 index 0000000000..5517d3105b --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v14.patch @@ -0,0 +1,143 @@ +commit 7220bb3a3f23fa27207d77562dcc286f9a123313 +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. 
+ + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index baa8011..a601375 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2563,6 +2563,37 @@ COMMIT; + NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;, + DROP TABLE part_test; + NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 5e6fd38..ac9ded2 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" + #include "access/relation.h" +@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit even onto the stack */ + stackItem = stack_push(); +@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index cc1374a..1870a60 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1612,6 +1612,36 @@ COMMIT; + + DROP TABLE part_test; + ++-- ++-- Test logging in 
parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute/patches/pgaudit-parallel_workers-v15.patch b/compute/patches/pgaudit-parallel_workers-v15.patch new file mode 100644 index 0000000000..6dfffbd0dd --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v15.patch @@ -0,0 +1,143 @@ +commit 29dc2847f6255541992f18faf8a815dfab79631a +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. + + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index b22560b..73f0327 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2563,6 +2563,37 @@ COMMIT; + NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;, + DROP TABLE part_test; + NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 5e6fd38..ac9ded2 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" + #include "access/relation.h" +@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ 
if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit even onto the stack */ + stackItem = stack_push(); +@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index 8052426..7f0667b 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1612,6 +1612,36 @@ COMMIT; + + DROP TABLE part_test; + ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute/patches/pgaudit-parallel_workers-v16.patch b/compute/patches/pgaudit-parallel_workers-v16.patch new file mode 100644 index 0000000000..6b8b276b7b --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v16.patch @@ -0,0 +1,143 @@ +commit cc708dde7ef2af2a8120d757102d2e34c0463a0f +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. 
+ + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index 8772054..9b66ac6 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2556,6 +2556,37 @@ DROP SERVER fdw_server; + NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server;, + DROP EXTENSION postgres_fdw; + NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw;, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 004d1f9..f061164 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" + #include "access/relation.h" +@@ -1339,7 +1340,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit even onto the stack */ + stackItem = stack_push(); +@@ -1420,7 +1421,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1475,7 +1476,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1495,7 +1496,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index 6aae88b..de6d7fd 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1631,6 +1631,36 
@@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server; + DROP SERVER fdw_server; + DROP EXTENSION postgres_fdw; + ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute/patches/pgaudit-parallel_workers-v17.patch b/compute/patches/pgaudit-parallel_workers-v17.patch new file mode 100644 index 0000000000..f99be10c60 --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v17.patch @@ -0,0 +1,143 @@ +commit 8d02e4c6c5e1e8676251b0717a46054267091cb4 +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. + + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index d696287..4b1059a 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2568,6 +2568,37 @@ DROP SERVER fdw_server; + NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server, + DROP EXTENSION postgres_fdw; + NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 1764af1..0e48875 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" 
+ #include "access/relation.h" +@@ -1406,7 +1407,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit event onto the stack */ + stackItem = stack_push(); +@@ -1489,7 +1490,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1544,7 +1545,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1564,7 +1565,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index e161f01..c873098 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1637,6 +1637,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server; + DROP SERVER fdw_server; + DROP EXTENSION postgres_fdw; + ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index fae76579d8..ec6e6c1634 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -29,7 +29,8 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; -use tokio::spawn; +use tokio::task::JoinHandle; +use tokio::{spawn, time}; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; use utils::id::{TenantId, TimelineId}; @@ -107,6 +108,8 @@ pub struct ComputeNodeParams { pub installed_extensions_collection_interval: Arc, } +type TaskHandle = 
Mutex>>; + /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { pub params: ComputeNodeParams, @@ -129,7 +132,8 @@ pub struct ComputeNode { pub compute_ctl_config: ComputeCtlConfig, /// Handle to the extension stats collection task - extension_stats_task: Mutex>>, + extension_stats_task: TaskHandle, + lfc_offload_task: TaskHandle, } // store some metrics about download size that might impact startup time @@ -368,7 +372,7 @@ fn maybe_cgexec(cmd: &str) -> Command { struct PostgresHandle { postgres: std::process::Child, - log_collector: tokio::task::JoinHandle>, + log_collector: JoinHandle>, } impl PostgresHandle { @@ -382,7 +386,7 @@ struct StartVmMonitorResult { #[cfg(target_os = "linux")] token: tokio_util::sync::CancellationToken, #[cfg(target_os = "linux")] - vm_monitor: Option>>, + vm_monitor: Option>>, } impl ComputeNode { @@ -433,6 +437,7 @@ impl ComputeNode { ext_download_progress: RwLock::new(HashMap::new()), compute_ctl_config: config.compute_ctl_config, extension_stats_task: Mutex::new(None), + lfc_offload_task: Mutex::new(None), }) } @@ -520,8 +525,8 @@ impl ComputeNode { None }; - // Terminate the extension stats collection task this.terminate_extension_stats_task(); + this.terminate_lfc_offload_task(); // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. @@ -851,12 +856,15 @@ impl ComputeNode { // Log metrics so that we can search for slow operations in logs info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished"); - // Spawn the extension stats background task self.spawn_extension_stats_task(); if pspec.spec.autoprewarm { + info!("autoprewarming on startup as requested"); self.prewarm_lfc(None); } + if let Some(seconds) = pspec.spec.offload_lfc_interval_seconds { + self.spawn_lfc_offload_task(Duration::from_secs(seconds.into())); + }; Ok(()) } @@ -1049,7 +1057,7 @@ impl ComputeNode { }; let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( shard0_connstr, spec.tenant_id, spec.timeline_id, @@ -2357,10 +2365,7 @@ LIMIT 100", } pub fn spawn_extension_stats_task(&self) { - // Cancel any existing task - if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { - handle.abort(); - } + self.terminate_extension_stats_task(); let conf = self.tokio_conn_conf.clone(); let atomic_interval = self.params.installed_extensions_collection_interval.clone(); @@ -2371,24 +2376,23 @@ LIMIT 100", installed_extensions_collection_interval ); let handle = tokio::spawn(async move { - // An initial sleep is added to ensure that two collections don't happen at the same time. - // The first collection happens during compute startup. - tokio::time::sleep(tokio::time::Duration::from_secs( - installed_extensions_collection_interval, - )) - .await; - let mut interval = tokio::time::interval(tokio::time::Duration::from_secs( - installed_extensions_collection_interval, - )); loop { - interval.tick().await; + info!( + "[NEON_EXT_INT_SLEEP]: Interval: {}", + installed_extensions_collection_interval + ); + // Sleep at the start of the loop to ensure that two collections don't happen at the same time. + // The first collection happens during compute startup. 
+ tokio::time::sleep(tokio::time::Duration::from_secs( + installed_extensions_collection_interval, + )) + .await; let _ = installed_extensions(conf.clone()).await; // Acquire a read lock on the compute spec and then update the interval if necessary - interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max( + installed_extensions_collection_interval = std::cmp::max( installed_extensions_collection_interval, 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst), - ))); - installed_extensions_collection_interval = interval.period().as_secs(); + ); } }); @@ -2397,8 +2401,30 @@ LIMIT 100", } fn terminate_extension_stats_task(&self) { - if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { - handle.abort(); + if let Some(h) = self.extension_stats_task.lock().unwrap().take() { + h.abort() + } + } + + pub fn spawn_lfc_offload_task(self: &Arc, interval: Duration) { + self.terminate_lfc_offload_task(); + let secs = interval.as_secs(); + info!("spawning lfc offload worker with {secs}s interval"); + let this = self.clone(); + let handle = spawn(async move { + let mut interval = time::interval(interval); + interval.tick().await; // returns immediately + loop { + interval.tick().await; + this.offload_lfc_async().await; + } + }); + *self.lfc_offload_task.lock().unwrap() = Some(handle); + } + + fn terminate_lfc_offload_task(&self) { + if let Some(h) = self.lfc_offload_task.lock().unwrap().take() { + h.abort() } } @@ -2407,19 +2433,11 @@ LIMIT 100", // If the value is -1, we never suspend so set the value to default collection. // If the value is 0, it means default, we will just continue to use the default. if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { - info!( - "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}", - spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL - ); self.params.installed_extensions_collection_interval.store( DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, std::sync::atomic::Ordering::SeqCst, ); } else { - info!( - "[NEON_EXT_INT_UPD] Spec Timeout: {}", - spec.suspend_timeout_seconds - ); self.params.installed_extensions_collection_interval.store( spec.suspend_timeout_seconds as u64, std::sync::atomic::Ordering::SeqCst, diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs index 1c7a7bef60..3f6f9a7ecc 100644 --- a/compute_tools/src/compute_prewarm.rs +++ b/compute_tools/src/compute_prewarm.rs @@ -5,6 +5,7 @@ use compute_api::responses::LfcOffloadState; use compute_api::responses::LfcPrewarmState; use http::StatusCode; use reqwest::Client; +use std::mem::replace; use std::sync::Arc; use tokio::{io::AsyncReadExt, spawn}; use tracing::{error, info}; @@ -88,17 +89,15 @@ impl ComputeNode { self.state.lock().unwrap().lfc_offload_state.clone() } - /// Returns false if there is a prewarm request ongoing, true otherwise + /// If there is a prewarm request ongoing, return false, true otherwise pub fn prewarm_lfc(self: &Arc, from_endpoint: Option) -> bool { - crate::metrics::LFC_PREWARM_REQUESTS.inc(); { let state = &mut self.state.lock().unwrap().lfc_prewarm_state; - if let LfcPrewarmState::Prewarming = - std::mem::replace(state, LfcPrewarmState::Prewarming) - { + if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) { return false; } } + crate::metrics::LFC_PREWARMS.inc(); let cloned = self.clone(); spawn(async move { @@ -106,7 +105,8 @@ impl ComputeNode { cloned.state.lock().unwrap().lfc_prewarm_state = 
LfcPrewarmState::Completed; return; }; - error!(%err); + crate::metrics::LFC_PREWARM_ERRORS.inc(); + error!(%err, "prewarming lfc"); cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed { error: err.to_string(), }; @@ -152,32 +152,42 @@ impl ComputeNode { .map(|_| ()) } - /// Returns false if there is an offload request ongoing, true otherwise + /// If offload request is ongoing, return false, true otherwise pub fn offload_lfc(self: &Arc) -> bool { - crate::metrics::LFC_OFFLOAD_REQUESTS.inc(); { let state = &mut self.state.lock().unwrap().lfc_offload_state; - if let LfcOffloadState::Offloading = - std::mem::replace(state, LfcOffloadState::Offloading) - { + if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading { return false; } } - let cloned = self.clone(); - spawn(async move { - let Err(err) = cloned.offload_lfc_impl().await else { - cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; - return; - }; - error!(%err); - cloned.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { - error: err.to_string(), - }; - }); + spawn(async move { cloned.offload_lfc_with_state_update().await }); true } + pub async fn offload_lfc_async(self: &Arc) { + { + let state = &mut self.state.lock().unwrap().lfc_offload_state; + if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading { + return; + } + } + self.offload_lfc_with_state_update().await + } + + async fn offload_lfc_with_state_update(&self) { + crate::metrics::LFC_OFFLOADS.inc(); + let Err(err) = self.offload_lfc_impl().await else { + self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed; + return; + }; + crate::metrics::LFC_OFFLOAD_ERRORS.inc(); + error!(%err, "offloading lfc"); + self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed { + error: err.to_string(), + }; + } + async fn offload_lfc_impl(&self) -> Result<()> { let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?; info!(%url, "requesting LFC state from postgres"); diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3346c18c0d..bb0828429d 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -192,7 +192,7 @@ fn acquire_lsn_lease_grpc( lsn: Lsn, ) -> Result> { tokio::runtime::Handle::current().block_on(async move { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( connstring.to_string(), tenant_shard_id.tenant_id, timeline_id, diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index 90326b2074..91dedbb42a 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -97,20 +97,34 @@ pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy> = Lazy:: .expect("failed to define a metric") }); -/// Needed as neon.file_cache_prewarm_batch == 0 doesn't mean we never tried to prewarm. 
-/// On the other hand, LFC_PREWARMED_PAGES is excessive as we can GET /lfc/prewarm -pub(crate) static LFC_PREWARM_REQUESTS: Lazy = Lazy::new(|| { +pub(crate) static LFC_PREWARMS: Lazy = Lazy::new(|| { register_int_counter!( - "compute_ctl_lfc_prewarm_requests_total", - "Total number of LFC prewarm requests made by compute_ctl", + "compute_ctl_lfc_prewarms_total", + "Total number of LFC prewarms requested by compute_ctl or autoprewarm option", ) .expect("failed to define a metric") }); -pub(crate) static LFC_OFFLOAD_REQUESTS: Lazy = Lazy::new(|| { +pub(crate) static LFC_PREWARM_ERRORS: Lazy = Lazy::new(|| { register_int_counter!( - "compute_ctl_lfc_offload_requests_total", - "Total number of LFC offload requests made by compute_ctl", + "compute_ctl_lfc_prewarm_errors_total", + "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOADS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offloads_total", + "Total number of LFC offloads requested by compute_ctl or lfc_offload_period_seconds option", + ) + .expect("failed to define a metric") +}); + +pub(crate) static LFC_OFFLOAD_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "compute_ctl_lfc_offload_errors_total", + "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option", ) .expect("failed to define a metric") }); @@ -124,7 +138,9 @@ pub fn collect() -> Vec { metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics.extend(PG_CURR_DOWNTIME_MS.collect()); metrics.extend(PG_TOTAL_DOWNTIME_MS.collect()); - metrics.extend(LFC_PREWARM_REQUESTS.collect()); - metrics.extend(LFC_OFFLOAD_REQUESTS.collect()); + metrics.extend(LFC_PREWARMS.collect()); + metrics.extend(LFC_PREWARM_ERRORS.collect()); + metrics.extend(LFC_OFFLOADS.collect()); + metrics.extend(LFC_OFFLOAD_ERRORS.collect()); metrics } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 2b865c75d0..fae59082c6 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -31,6 +31,7 @@ mod pg_helpers_tests { wal_level = logical hot_standby = on autoprewarm = off +offload_lfc_interval_seconds = 20 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 3440d8979a..6021933d6a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -64,7 +64,9 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); +#[allow(dead_code)] const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; +const DEFAULT_PG_VERSION_NUM: &str = "17"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; @@ -167,7 +169,7 @@ struct TenantCreateCmdArgs { #[clap(short = 'c')] config: Vec, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version to use for the initial timeline")] pg_version: PgMajorVersion, @@ -290,7 +292,7 @@ struct TimelineCreateCmdArgs { #[clap(long, help = "Human-readable alias for the new timeline")] branch_name: String, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version")] pg_version: PgMajorVersion, } @@ 
-322,7 +324,7 @@ struct TimelineImportCmdArgs { #[clap(long, help = "Lsn the basebackup ends at")] end_lsn: Option, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version of the backup being imported")] pg_version: PgMajorVersion, } @@ -601,7 +603,7 @@ struct EndpointCreateCmdArgs { )] config_only: bool, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version")] pg_version: PgMajorVersion, @@ -673,6 +675,16 @@ struct EndpointStartCmdArgs { #[arg(default_value = "90s")] start_timeout: Duration, + #[clap( + long, + help = "Download LFC cache from endpoint storage on endpoint startup", + default_value = "false" + )] + autoprewarm: bool, + + #[clap(long, help = "Upload LFC cache to endpoint storage periodically")] + offload_lfc_interval_seconds: Option, + #[clap( long, help = "Run in development mode, skipping VM-specific operations like process termination", @@ -1583,22 +1595,24 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res let endpoint_storage_token = env.generate_auth_token(&claims)?; let endpoint_storage_addr = env.endpoint_storage.listen_addr.to_string(); + let args = control_plane::endpoint::EndpointStartArgs { + auth_token, + endpoint_storage_token, + endpoint_storage_addr, + safekeepers_generation, + safekeepers, + pageservers, + remote_ext_base_url: remote_ext_base_url.clone(), + shard_stripe_size: stripe_size.0 as usize, + create_test_user: args.create_test_user, + start_timeout: args.start_timeout, + autoprewarm: args.autoprewarm, + offload_lfc_interval_seconds: args.offload_lfc_interval_seconds, + dev: args.dev, + }; + println!("Starting existing endpoint {endpoint_id}..."); - endpoint - .start( - &auth_token, - endpoint_storage_token, - endpoint_storage_addr, - safekeepers_generation, - safekeepers, - pageservers, - remote_ext_base_url.as_ref(), - stripe_size.0 as usize, - args.create_test_user, - args.start_timeout, - args.dev, - ) - .await?; + endpoint.start(args).await?; } EndpointCmd::Reconfigure(args) => { let endpoint_id = &args.endpoint_id; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 424101b9a4..74ab15dc97 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -373,6 +373,22 @@ impl std::fmt::Display for EndpointTerminateMode { } } +pub struct EndpointStartArgs { + pub auth_token: Option, + pub endpoint_storage_token: String, + pub endpoint_storage_addr: String, + pub safekeepers_generation: Option, + pub safekeepers: Vec, + pub pageservers: Vec<(PageserverProtocol, Host, u16)>, + pub remote_ext_base_url: Option, + pub shard_stripe_size: usize, + pub create_test_user: bool, + pub start_timeout: Duration, + pub autoprewarm: bool, + pub offload_lfc_interval_seconds: Option, + pub dev: bool, +} + impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { @@ -677,21 +693,7 @@ impl Endpoint { }) } - #[allow(clippy::too_many_arguments)] - pub async fn start( - &self, - auth_token: &Option, - endpoint_storage_token: String, - endpoint_storage_addr: String, - safekeepers_generation: Option, - safekeepers: Vec, - pageservers: Vec<(PageserverProtocol, Host, u16)>, - remote_ext_base_url: Option<&String>, - shard_stripe_size: usize, - create_test_user: bool, - start_timeout: Duration, - dev: bool, - ) -> Result<()> { + pub async fn start(&self, args: EndpointStartArgs) 
-> Result<()> { if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); } @@ -704,10 +706,10 @@ impl Endpoint { std::fs::remove_dir_all(self.pgdata())?; } - let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); + let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers); assert!(!pageserver_connstring.is_empty()); - let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; + let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?; // check for file remote_extensions_spec.json // if it is present, read it and pass to compute_ctl @@ -735,7 +737,7 @@ impl Endpoint { cluster_id: None, // project ID: not used name: None, // project name: not used state: None, - roles: if create_test_user { + roles: if args.create_test_user { vec![Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, @@ -744,7 +746,7 @@ impl Endpoint { } else { Vec::new() }, - databases: if create_test_user { + databases: if args.create_test_user { vec![Database { name: PgIdent::from_str("neondb").unwrap(), owner: PgIdent::from_str("test").unwrap(), @@ -766,20 +768,21 @@ impl Endpoint { endpoint_id: Some(self.endpoint_id.clone()), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), - safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), + safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()), safekeeper_connstrings, - storage_auth_token: auth_token.clone(), + storage_auth_token: args.auth_token.clone(), remote_extensions, pgbouncer_settings: None, - shard_stripe_size: Some(shard_stripe_size), + shard_stripe_size: Some(args.shard_stripe_size), local_proxy_config: None, reconfigure_concurrency: self.reconfigure_concurrency, drop_subscriptions_before_start: self.drop_subscriptions_before_start, audit_log_level: ComputeAudit::Disabled, logs_export_host: None::, - endpoint_storage_addr: Some(endpoint_storage_addr), - endpoint_storage_token: Some(endpoint_storage_token), - autoprewarm: false, + endpoint_storage_addr: Some(args.endpoint_storage_addr), + endpoint_storage_token: Some(args.endpoint_storage_token), + autoprewarm: args.autoprewarm, + offload_lfc_interval_seconds: args.offload_lfc_interval_seconds, suspend_timeout_seconds: -1, // Only used in neon_local. }; @@ -791,7 +794,7 @@ impl Endpoint { debug!("spec.cluster {:?}", spec.cluster); // fill missing fields again - if create_test_user { + if args.create_test_user { spec.cluster.roles.push(Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, @@ -826,7 +829,7 @@ impl Endpoint { // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{conn_str}'"); - if create_test_user { + if args.create_test_user { let conn_str = self.connstr("test", "neondb"); println!("Also at '{conn_str}'"); } @@ -858,11 +861,11 @@ impl Endpoint { .stderr(logfile.try_clone()?) 
.stdout(logfile); - if let Some(remote_ext_base_url) = remote_ext_base_url { - cmd.args(["--remote-ext-base-url", remote_ext_base_url]); + if let Some(remote_ext_base_url) = args.remote_ext_base_url { + cmd.args(["--remote-ext-base-url", &remote_ext_base_url]); } - if dev { + if args.dev { cmd.arg("--dev"); } @@ -894,10 +897,11 @@ impl Endpoint { Ok(state) => { match state.status { ComputeStatus::Init => { - if Instant::now().duration_since(start_at) > start_timeout { + let timeout = args.start_timeout; + if Instant::now().duration_since(start_at) > timeout { bail!( "compute startup timed out {:?}; still in Init state", - start_timeout + timeout ); } // keep retrying @@ -925,9 +929,10 @@ impl Endpoint { } } Err(e) => { - if Instant::now().duration_since(start_at) > start_timeout { + if Instant::now().duration_since(start_at) > args.start_timeout { return Err(e).context(format!( - "timed out {start_timeout:?} waiting to connect to compute_ctl HTTP", + "timed out {:?} waiting to connect to compute_ctl HTTP", + args.start_timeout )); } } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 0036b7d0f6..701c4b3b2e 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -65,12 +65,27 @@ enum Command { #[arg(long)] scheduling: Option, }, - // Set a node status as deleted. + /// Exists for backup usage and will be removed in future. + /// Use [`Command::NodeStartDelete`] instead, if possible. NodeDelete { #[arg(long)] node_id: NodeId, }, + /// Start deletion of the specified pageserver. + NodeStartDelete { + #[arg(long)] + node_id: NodeId, + }, + /// Cancel deletion of the specified pageserver and wait for `timeout` + /// for the operation to be canceled. May be retried. + NodeCancelDelete { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + timeout: humantime::Duration, + }, /// Delete a tombstone of node from the storage controller. + /// This is used when we want to allow the node to be re-registered. NodeDeleteTombstone { #[arg(long)] node_id: NodeId, @@ -912,10 +927,43 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::NodeDelete { node_id } => { + eprintln!("Warning: This command is obsolete and will be removed in a future version"); + eprintln!("Use `NodeStartDelete` instead, if possible"); storcon_client .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None) .await?; } + Command::NodeStartDelete { node_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/node/{node_id}/delete"), + None, + ) + .await?; + println!("Delete started for {node_id}"); + } + Command::NodeCancelDelete { node_id, timeout } => { + storcon_client + .dispatch::<(), ()>( + Method::DELETE, + format!("control/v1/node/{node_id}/delete"), + None, + ) + .await?; + + println!("Waiting for node {node_id} to quiesce on scheduling policy ..."); + + let final_policy = + wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| { + !matches!(sched, NodeSchedulingPolicy::Deleting) + }) + .await?; + + println!( + "Delete was cancelled for node {node_id}. 
Schedulling policy is now {final_policy:?}" + ); + } Command::NodeDeleteTombstone { node_id } => { storcon_client .dispatch::<(), ()>( diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 1e62e91fd0..6f36b4358e 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -54,14 +54,16 @@ else printf '%s\n' "${result}" | jq . fi - echo "Check if a timeline present" - PARAMS=( - -X GET - -H "Content-Type: application/json" - "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" - ) - timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) - if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then + if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then + echo "Check if a timeline present" + PARAMS=( + -X GET + -H "Content-Type: application/json" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline" + ) + timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id) + fi + if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then generate_id timeline_id PARAMS=( -sbf diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 2519b75c7f..19c3bc74e2 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -142,7 +142,7 @@ services: - "storage_broker" - "--listen-addr=0.0.0.0:50051" - compute: + compute1: restart: always build: context: ./compute_wrapper/ @@ -152,6 +152,7 @@ services: - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} - https_proxy=${https_proxy:-} + image: built-compute environment: - PG_VERSION=${PG_VERSION:-16} - TENANT_ID=${TENANT_ID:-} @@ -166,6 +167,11 @@ services: - 3080:3080 # http endpoints entrypoint: - "/shell/compute.sh" + # Ad an alias for compute1 for compatibility + networks: + default: + aliases: + - compute depends_on: - safekeeper1 - safekeeper2 @@ -174,15 +180,20 @@ services: compute_is_ready: image: postgres:latest + environment: + - PARALLEL_COMPUTES=1 entrypoint: - - "/bin/bash" + - "/bin/sh" - "-c" command: - - "until pg_isready -h compute -p 55433 -U cloud_admin ; do - echo 'Waiting to start compute...' && sleep 1; - done" + - "for i in $(seq 1 $${PARALLEL_COMPUTES}); do + until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do + sleep 1; + done; + done; + echo All computes are started" depends_on: - - compute + - compute1 neon-test-extensions: profiles: ["test-extensions"] @@ -196,4 +207,4 @@ services: command: - sleep 3600 depends_on: - - compute + - compute1 diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 6edf90ca8d..063b8dee85 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # A basic test to ensure Docker images are built correctly. # Build a wrapper around the compute, start all services and runs a simple SQL query. 
@@ -13,9 +13,36 @@ # set -eux -o pipefail +cd "$(dirname "${0}")" export COMPOSE_FILE='docker-compose.yml' export COMPOSE_PROFILES=test-extensions -cd "$(dirname "${0}")" +export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1} +READY_MESSAGE="All computes are started" +COMPUTES=() +for i in $(seq 1 "${PARALLEL_COMPUTES}"); do + COMPUTES+=("compute${i}") +done +CURRENT_TMPDIR=$(mktemp -d) +trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT +if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then + export COMPOSE_FILE=docker-compose-parallel.yml + cp docker-compose.yml docker-compose-parallel.yml + # Replace the environment variable PARALLEL_COMPUTES with the actual value + yq eval -i ".services.compute_is_ready.environment |= map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE} + for i in $(seq 2 "${PARALLEL_COMPUTES}"); do + # Duplicate compute1 as compute${i} for parallel execution + yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE} + # We don't need these sections, so delete them + yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE} + # Let the compute 1 be the only dependence + yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE} + # Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes + yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE} + # Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes + # They will create new TENANT_ID and TIMELINE_ID anyway. + yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE} + done +fi PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" function cleanup() { @@ -27,11 +54,11 @@ function cleanup() { for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do pg_version=${pg_version/v/} - echo "clean up containers if exists" + echo "clean up containers if exist" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d - + PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1 + PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 while sleep 3; do @@ -41,45 +68,50 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "timeout before the compute is ready." exit 1 fi - if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then + if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then echo "OK. The compute is ready to connect." echo "execute simple queries." 
- docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'" + for compute in "${COMPUTES[@]}"; do + docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'" + done break fi done if [[ ${pg_version} -ge 16 ]]; then - # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail - # It cannot be moved to Dockerfile now because the database directory is created after the start of the container - echo Adding dummy config - docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf - # Prepare for the PostGIS test - docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp - TMPDIR=$(mktemp -d) - docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}" - docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}" - docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install - docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test - docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress - rm -rf "${TMPDIR}" - # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment - TMPDIR=$(mktemp -d) - docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data" - docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/ - rm -rf "${TMPDIR}" - # The following block does the same for the contrib/file_fdw test - TMPDIR=$(mktemp -d) - docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data" - docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data - rm -rf "${TMPDIR}" + mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src} + docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test" + docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install" + docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data" + docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data" + + for compute in "${COMPUTES[@]}"; do + # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail + # It cannot be moved to Dockerfile now because the database directory is created after the start of the container + echo Adding dummy config on "${compute}" + docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # Prepare for the PostGIS test + docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install + docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test + docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress + # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment + docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/ + # The following block does the same for the contrib/file_fdw test + 
docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data + done # Apply patches docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch" # We are running tests now rm -f testout.txt testout_contrib.txt + # We want to run the longest tests first to better utilize parallelization and reduce overall test time. + # Tests listed in the RUN_FIRST variable will be run before others. + # If parallelization is not used, this environment variable will be ignored. + docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \ + -e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \ neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0 docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \ + -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \ neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0 if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then CONTRIB_FAILED= diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 930402ce66..b37b9363fa 100755 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -x if [[ -v BENCHMARK_CONNSTR ]]; then @@ -26,8 +26,9 @@ if [[ -v BENCHMARK_CONNSTR ]]; then fi fi REGULAR_USER=false -while getopts r arg; do - case $arg in +PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1} +while getopts pr arg; do + case ${arg} in r) REGULAR_USER=true shift $((OPTIND-1)) @@ -41,26 +42,49 @@ extdir=${1} cd "${extdir}" || exit 2 FAILED= -LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) -for d in ${LIST}; do - [ -d "${d}" ] || continue - if ! psql -w -c "select 1" >/dev/null; then - FAILED="${d} ${FAILED}" - break - fi - if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then - "${d}/regular-test.sh" || FAILED="${d} ${FAILED}" - continue - fi +export FAILED_FILE=/tmp/failed +rm -f ${FAILED_FILE} +mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u) +if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then + # Avoid errors if RUN_FIRST is not defined + RUN_FIRST=${RUN_FIRST:-} + # Move entries listed in the RUN_FIRST variable to the beginning + ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}")) + parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0 + export PGHOST=compute{%} + if ! psql -c 'select 1'>/dev/null; then + exit 1 + fi + echo Running on \${PGHOST} + if [[ -f ${extdir}/{}/neon-test.sh ]]; then + echo Running from script + ${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE}; + else + echo Running using make; + USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE}; + fi" ::: ${ORDERED_LIST} + [[ ! -f ${FAILED_FILE} ]] && exit 0 +else + for d in "${LIST[@]}"; do + [ -d "${d}" ] || continue + if ! 
psql -w -c "select 1" >/dev/null; then + FAILED="${d} ${FAILED}" + break + fi + if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then + "${d}/regular-test.sh" || FAILED="${d} ${FAILED}" + continue + fi - if [ -f "${d}/neon-test.sh" ]; then - "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" - else - USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" - fi -done -[ -z "${FAILED}" ] && exit 0 -for d in ${FAILED}; do + if [ -f "${d}/neon-test.sh" ]; then + "${d}/neon-test.sh" || FAILED="${d} ${FAILED}" + else + USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}" + fi + done + [[ -z ${FAILED} ]] && exit 0 +fi +for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do cat "$(find $d -name regression.diffs)" done for postgis_diff in /tmp/pgis_reg/*_diff; do @@ -68,4 +92,5 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do cat "${postgis_diff}" done echo "${FAILED}" +cat ${FAILED_FILE} exit 1 diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh index f1cf17f531..1d39fc029e 100755 --- a/docker-compose/test_extensions_upgrade.sh +++ b/docker-compose/test_extensions_upgrade.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -eux -o pipefail cd "$(dirname "${0}")" # Takes a variable name as argument. The result is stored in that variable. @@ -60,8 +60,8 @@ function check_timeline() { # Restarts the compute node with the required compute tag and timeline. # Accepts the tag for the compute node and the timeline as parameters. function restart_compute() { - docker compose down compute compute_is_ready - COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready + docker compose down compute1 compute_is_ready + COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready wait_for_ready check_timeline ${2} } diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 9b320c7285..8619f83ff5 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake up all computes for the change. Neither we want to fully reimplement the leader logic second time outside compute. Because of that the proposed algorithm relies for issuing configurations on the external fault tolerant (distributed) strongly -consisent storage with simple API: CAS (compare-and-swap) on the single key. +consistent storage with simple API: CAS (compare-and-swap) on the single key. Properly configured postgres suits this. In the system consensus is implemented at the timeline level, so algorithm below @@ -34,7 +34,7 @@ A configuration is ``` struct Configuration { - generation: Generation, // a number uniquely identifying configuration + generation: SafekeeperGeneration, // a number uniquely identifying configuration sk_set: Vec, // current safekeeper set new_sk_set: Optional>, } @@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In response it sends its current configuration generation to let walproposer know. -Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` -accepting `Configuration`. 
Safekeeper switches to the given conf it is higher than its +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership` +accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its current one and ignores it otherwise. In any case it replies with ``` -struct ConfigurationSwitchResponse { +struct TimelineMembershipSwitchResponse { conf: Configuration, term: Term, last_log_term: Term, @@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting. It should stop talking to safekeepers not listed in the configuration at this point, though it is not unsafe to continue doing so. -To be elected it must receive votes from both majorites if `new_sk_set` is present. +To be elected it must receive votes from both majorities if `new_sk_set` is present. Similarly, to commit WAL it must receive flush acknowledge from both majorities. If walproposer hears from safekeeper configuration higher than his own (i.e. @@ -130,7 +130,7 @@ storage are reachable. 1) Fetch current timeline configuration from the configuration storage. 2) If it is already joint one and `new_set` is different from `desired_set` refuse to change. However, assign join conf to (in memory) var - `join_conf` and proceed to step 4 to finish the ongoing change. + `joint_conf` and proceed to step 4 to finish the ongoing change. 3) Else, create joint `joint_conf: Configuration`: increment current conf number `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration storage by doing CAS on the current generation: change happens only if @@ -161,11 +161,11 @@ storage are reachable. because `pull_timeline` already includes it and plus additionally would be broadcast by compute. More importantly, we may proceed to the next step only when `` on the majority of the new set reached - `sync_position`. Similarly, on the happy path no waiting is not needed because + `sync_position`. Similarly, on the happy path no waiting is needed because `pull_timeline` already includes it. However, we should double check to be safe. For example, timeline could have been created earlier e.g. manually or after try-to-migrate, abort, try-to-migrate-again sequence. -7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new +7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration storage under one more CAS. 8) Call `PUT` `configuration` on safekeepers from the new set, @@ -178,12 +178,12 @@ spec of it. Description above focuses on safety. To make the flow practical and live, here a few more considerations. -1) It makes sense to ping new set to ensure it we are migrating to live node(s) before +1) It makes sense to ping new set to ensure we are migrating to live node(s) before step 3. 2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed it is safe to rollback to the old conf with one more CAS. 3) On step 4 timeline might be already created on members of the new set for various reasons; - the simplest is the procedure restart. There are more complicated scenarious like mentioned + the simplest is the procedure restart. There are more complicated scenarios like mentioned in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving generations, so seems simpler to treat existing timeline as success. 
However, this also has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in @@ -192,7 +192,7 @@ considerations. 4) In the end timeline should be locally deleted on the safekeeper(s) which are in the old set but not in the new one, unless they are unreachable. To be safe this also should be done under generation number (deletion proceeds only if - current configuration is <= than one in request and safekeeper is not memeber of it). + current configuration is <= than one in request and safekeeper is not member of it). 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`, jump to step 7, using it as `new_conf`. @@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST Response should be augmented with `safekeepers_generation` and `safekeepers` fields like described in `/notify-safekeepers` above. Initially (currently) these fields may be absent; in this case cplane chooses safekeepers on its own -like it currently does. The call should be retried until succeeds. +like it currently does. The call should be retried until it succeeds. Timeline deletion and tenant deletion in cplane should call appropriate storage_controller endpoints like it currently does for sharded tenants. The calls should be retried until they succeed. -When compute receives safekeepers list from control plane it needs to know the -generation to checked whether it should be updated (note that compute may get +When compute receives safekeeper list from control plane it needs to know the +generation to check whether it should be updated (note that compute may get safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers` GUC is just a comma separates list of `host:port`. Let's prefix it with `g#:` to this end, so it will look like @@ -305,8 +305,8 @@ enum MigrationRequest { ``` `FinishPending` requests to run the procedure to ensure state is clean: current -configuration is not joint and majority of safekeepers are aware of it, but do -not attempt to migrate anywhere. If current configuration fetched on step 1 is +configuration is not joint and the majority of safekeepers are aware of it, but do +not attempt to migrate anywhere. If the current configuration fetched on step 1 is not joint it jumps to step 7. It should be run at startup for all timelines (but similarly, in the first version it is ok to trigger it manually). @@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually). `safekeepers` table mirroring current `nodes` should be added, except that for `scheduling_policy`: it is enough to have at least in the beginning only 3 fields: 1) `active` 2) `paused` (initially means only not assign new tlis there -3) `decomissioned` (node is removed). +3) `decommissioned` (node is removed). `timelines` table: ``` @@ -326,9 +326,10 @@ table! { tenant_id -> Varchar, start_lsn -> pg_lsn, generation -> Int4, - sk_set -> Array, // list of safekeeper ids + sk_set -> Array, // list of safekeeper ids new_sk_set -> Nullable>, // list of safekeeper ids, null if not joint conf cplane_notified_generation -> Int4, + sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about deleted_at -> Nullable, } } @@ -338,13 +339,23 @@ table! { might also want to add ancestor_timeline_id to preserve the hierarchy, but for this RFC it is not needed. 
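As an illustration of the joint-consensus rule stated earlier in this RFC (while `new_sk_set` is present, votes and flush acknowledgements must come from majorities of both the old and the new safekeeper sets), here is a minimal Rust sketch of the quorum check. The types are deliberately simplified stand-ins (plain integers for generations and node ids), not the structs used in the actual implementation.

```rust
/// Simplified stand-in for the RFC's `Configuration`: only the fields needed
/// for the quorum check are kept, and node ids are plain integers.
struct Configuration {
    generation: u32,
    sk_set: Vec<u64>,
    new_sk_set: Option<Vec<u64>>,
}

/// True if the acknowledging safekeepers form a strict majority of `set`.
fn has_majority(set: &[u64], acked: &[u64]) -> bool {
    let votes = set.iter().filter(|&id| acked.contains(id)).count();
    votes > set.len() / 2
}

/// The joint-consensus rule: acks are sufficient only if they form a majority
/// of the current set and, while a change is in progress, of the new set too.
fn quorum_reached(conf: &Configuration, acked: &[u64]) -> bool {
    has_majority(&conf.sk_set, acked)
        && conf
            .new_sk_set
            .as_ref()
            .map_or(true, |new_set| has_majority(new_set, acked))
}

fn main() {
    let joint = Configuration {
        generation: 4,
        sk_set: vec![1, 2, 3],
        new_sk_set: Some(vec![4, 5, 6]),
    };
    // Acks from safekeepers {1, 2, 4, 5} cover a majority of both sets,
    // so under the joint configuration this quorum is sufficient.
    println!(
        "generation {}: quorum reached = {}",
        joint.generation,
        quorum_reached(&joint, &[1, 2, 4, 5])
    );
}
```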
+`cplane_notified_generation` and `sk_set_notified_generation` fields are used to +track the last stage of the algorithm, when we need to notify safekeeper set and cplane +with the final configuration after it's already committed to DB. + +The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and +`*_notified_generation` fields are up to date with `generation`. + +It's possible to replace `*_notified_generation` with one boolean field `migration_completed`, +but for better observability it's nice to have them separately. + #### API Node management is similar to pageserver: -1) POST `/control/v1/safekeepers` inserts safekeeper. -2) GET `/control/v1/safekeepers` lists safekeepers. -3) GET `/control/v1/safekeepers/:node_id` gets safekeeper. -4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g. +1) POST `/control/v1/safekeeper` inserts safekeeper. +2) GET `/control/v1/safekeeper` lists safekeepers. +3) GET `/control/v1/safekeeper/:node_id` gets safekeeper. +4) PUT `/control/v1/safekeper/:node_id/scheduling_policy` changes status to e.g. `offline` or `decomissioned`. Initially it is simpler not to schedule any migrations here. @@ -368,8 +379,8 @@ Migration API: the first version is the simplest and the most imperative: all timelines from one safekeeper to another. It accepts json ``` { - "src_sk": u32, - "dst_sk": u32, + "src_sk": NodeId, + "dst_sk": NodeId, "limit": Optional, } ``` @@ -379,12 +390,15 @@ Returns list of scheduled requests. 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest` to move single timeline to given set of safekeepers: ``` -{ - "desired_set": Vec, +struct TimelineSafekeeperMigrateRequest { + "new_sk_set": Vec, } ``` -Returns scheduled request. +In the first version the handler migrates the timeline to `new_sk_set` synchronously. +Should be retried until success. + +In the future we might change it to asynchronous API and return scheduled request. Similar call should be added for the tenant. @@ -434,6 +448,9 @@ table! { } ``` +We load all pending ops from the table on startup into the memory. +The table is needed only to preserve the state between restarts. + `op_type` can be `include` (seed from peers and ensure generation is up to date), `exclude` (remove locally) and `delete`. Field is actually not strictly needed as it can be computed from current configuration, but gives more explicit @@ -474,7 +491,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For the initial (tenant creation) call cplane doesn't know it. However, setting start_lsn on safekeepers during creation is a good thing -- it provides a guarantee that walproposer can always find a common point in WAL histories of -safekeeper and its own, and so absense of it would be a clear sign of +safekeeper and its own, and so absence of it would be a clear sign of corruption. The following sequence works: 1) Create timeline (or observe that it exists) on pageserver, figuring out last_record_lsn in response. @@ -497,11 +514,9 @@ corruption. The following sequence works: retries the call until 200 response. There is a small question how request handler (timeline creation in this - case) would interact with per sk reconciler. As always I prefer to do the - simplest possible thing and here it seems to be just waking it up so it - re-reads the db for work to do. Passing work in memory is faster, but - that shouldn't matter, and path to scan db for work will exist anyway, - simpler to reuse it. 
+ case) would interact with per sk reconciler. In the current implementation + we first persist the request in the DB, and then send an in-memory request + to each safekeeper reconciler to process it. For pg version / wal segment size: while we may persist them in `timelines` table, it is not necessary as initial creation at step 3 can take them from @@ -509,30 +524,40 @@ pageserver or cplane creation call and later pull_timeline will carry them around. Timeline migration. -1) CAS to the db to create joint conf, and in the same transaction create - `safekeeper_timeline_pending_ops` `include` entries to initialize new members - as well as deliver this conf to current ones; poke per sk reconcilers to work - on it. Also any conf change should also poke cplane notifier task(s). -2) Once it becomes possible per alg description above, get out of joint conf - with another CAS. Task should get wakeups from per sk reconcilers because - conf switch is required for advancement; however retries should be sleep - based as well as LSN advancement might be needed, though in happy path - it isn't. To see whether further transition is possible on wakup migration - executor polls safekeepers per the algorithm. CAS creating new conf with only - new members should again insert entries to `safekeeper_timeline_pending_ops` - to switch them there, as well as `exclude` rows to remove timeline from - old members. +1) CAS to the db to create joint conf. Since this moment the migration is considered to be + "in progress". We can detect all "in-progress" migrations looking into the database. +2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership + configuration on all safekeepers, notify cplane, etc. All operations are idempotent, + so we don't need to persist anything in the database at this stage. If any errors occur, + it's safe to retry or abort the migration. +3) Once it becomes possible per alg description above, get out of joint conf + with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops` + in the same DB transaction. Adding `exclude` entries atomically is necessary because after + CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we + need to have them persisted somewhere in case the migration is interrupted right after the CAS. +4) Finish the migration. The final membership configuration is committed to the DB at this stage. + So, the migration cannot be aborted anymore. But it can still be retried if the migration fails + past stage 3. To finish the migration we need to send the new membership configuration to + a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude` + requests to in-memory queue for safekeeper reconciler. If the algorithm is retried, it's + possible that we have already committed `exclude` requests to DB, but didn't send them to + the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops` + because it's the only place where they are persistent. The fields `sk_set_notified_generation` + and `cplane_notified_generation` are updated after each step. The migration is considered + fully completed when they match the `generation` field. + +In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline +reconciler (if we implement it).
But it's wise to at least try to finish them synchronously, +so the timeline is always in a "good state" and doesn't require an old quorum to commit +WAL after the migration reported "success". Timeline deletion: just set `deleted_at` on the timeline row and insert `safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by per sk reconcilers. -When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops` +When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops` for it must be cleared in the same transaction. -One more task pool should infinitely retry notifying control plane about changed -safekeeper sets (trying making `cplane_notified_generation` equal `generation`). - #### Dealing with multiple instances of storage_controller Operations described above executed concurrently might create some errors but do @@ -541,7 +566,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy. To harden against some controller instance creating some work in `safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up -the job per sk reconcilers apart from explicit wakups should scan for work +the job per sk reconcilers apart from explicit wakeups should scan for work periodically. It is possible to remove that though if all db updates are protected with leadership token/term -- then such scans are needed only after leadership is acquired. @@ -563,7 +588,7 @@ There should be following layers of tests: safekeeper communication and pull_timeline need to be mocked and main switch procedure wrapped to as a node (thread) in simulation tests, using these mocks. Test would inject migrations like it currently injects - safekeeper/walproposer restars. Main assert is the same -- committed WAL must + safekeeper/walproposer restarts. Main assert is the same -- committed WAL must not be lost. 3) Since simulation testing injects at relatively high level points (not @@ -613,7 +638,7 @@ Let's have the following implementation bits for gradual rollout: `notify-safekeepers`. Then the rollout for a region would be: -- Current situation: safekeepers are choosen by control_plane. +- Current situation: safekeepers are chosen by control_plane. - We manually migrate some timelines, test moving them around. - Then we enable `--set-safekeepers` so that all new timelines are on storage controller. diff --git a/docs/rfcs/044-feature-flag.md b/docs/rfcs/044-feature-flag.md new file mode 100644 index 0000000000..3a0fe91a13 --- /dev/null +++ b/docs/rfcs/044-feature-flag.md @@ -0,0 +1,179 @@ +# Storage Feature Flags + +In this RFC, we will describe how we will implement per-tenant feature flags. + +## PostHog as Feature Flag Service + +Before we start, let's talk about how current feature flag services work. PostHog is the feature flag service we are currently using across multiple user-facing components in the company. PostHog has two modes of operation: HTTP evaluation and server-side local evaluation. + +Let's assume we have a storage feature flag called gc-compaction and we want to roll it out to scale-tier users with resident size >= 10GB and <= 100GB. + +### Define User Profiles + +The first step is to synchronize our user profiles to the PostHog service. We can simply assume that each tenant is a user in PostHog. Each user profile has some properties associated with it. In our case, it will be: plan type (free, scale, enterprise, etc); resident size (in bytes); primary pageserver (string); region (string). 
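As a rough illustration of the per-tenant profile described above, the sketch below builds the person-properties payload for one tenant. The `TenantProperties` struct, the property names, and the `person_properties` helper are hypothetical and only mirror the properties listed in this section (plan type, resident size, primary pageserver, region); they are not the actual profile-sync code.

```rust
use serde_json::{json, Value};

/// Hypothetical per-tenant profile; field names mirror the properties
/// listed in this RFC, not a real schema.
struct TenantProperties {
    tenant_id: String,
    plan_type: String, // e.g. "free", "scale", "enterprise"
    resident_size_bytes: u64,
    primary_pageserver: String,
    region: String,
}

/// Build the person-properties payload a profile-sync call would attach
/// to the tenant's PostHog profile (illustrative shape only).
fn person_properties(t: &TenantProperties) -> Value {
    json!({
        "distinct_id": t.tenant_id,
        "properties": {
            "plan_type": t.plan_type,
            "resident_size": t.resident_size_bytes,
            "pageserver": t.primary_pageserver,
            "region": t.region,
        }
    })
}

fn main() {
    let t = TenantProperties {
        tenant_id: "a1b2c3d4".into(),
        plan_type: "scale".into(),
        resident_size_bytes: 50 * 1024 * 1024 * 1024, // 50 GB resident size
        primary_pageserver: "pageserver-3".into(),
        region: "us-east-2".into(),
    };
    println!("{}", person_properties(&t));
}
```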
+ +### Define Feature Flags + +We would create a feature flag called gc-compaction in PostHog with 4 variants: disabled, stage-1, stage-2, fully-enabled. We will flip the feature flags from disabled to fully-enabled stage by stage for some percentage of our users. + +### Option 1: HTTP Evaluation Mode + +When using PostHog's HTTP evaluation mode, the client will make request to the PostHog service, asking for the value of a feature flag for a specific user. + +* Control plane will report the plan type to PostHog each time it attaches a tenant to the storcon or when the user upgrades/downgrades. It calls the PostHog profile API to associate tenant ID with the plan type. Assume we have X active tenants and such attach or plan change event happens each week, that would be 4X profile update requests per month. +* Pageservers will report the resident size and the primary pageserver to the PostHog service. Assume we report resident size every 24 hours, that would be 30X requests per month. +* Each tenant will request the state of the feature flag every 1 hour, that's 720X requests per month. +* The Rust client would be easy to implement as we only need to call the `/decide` API on PostHog. + +Using the HTTP evaluation mode we will issue 754X requests a month. + +### Option 2: Local Evaluation Mode + +When using PostHog's HTTP evaluation mode, the client (usually the server in a browser/server architecture) will poll the feature flag configuration every 30s (default in the Python client) from PostHog. Such configuration contains data like: + +
+ +Example JSON response from the PostHog local evaluation API + +``` +[ + { + "id": 1, + "name": "Beta Feature", + "key": "person-flag", + "is_simple_flag": True, + "active": True, + "filters": { + "groups": [ + { + "properties": [ + { + "key": "location", + "operator": "exact", + "value": ["Straße"], + "type": "person", + } + ], + "rollout_percentage": 100, + }, + { + "properties": [ + { + "key": "star", + "operator": "exact", + "value": ["ſun"], + "type": "person", + } + ], + "rollout_percentage": 100, + }, + ], + }, + } +] +``` + +
+
+Note that the API only contains information like "under what condition => rollout percentage". The user is responsible for providing the required properties to the client for local evaluation, and the PostHog service (web UI) cannot know if a feature is enabled for the tenant or not until the client uses the `capture` API to report the result back. To control the rollout percentage, the user ID gets mapped to a float number in `[0, 1)` on a consistent hash ring. All values <= the percentage will get the feature enabled or set to the desired value.
+
+To use the local evaluation mode, the system needs:
+
+* Assume each pageserver will poll PostHog for the local evaluation JSON every 5 minutes (instead of the 30s default as it's too frequent). That's 8640Y requests per month, where Y is the number of pageservers. Local evaluation requests cost 10x more than the normal decide request, so that's 86400Y request units to bill.
+* Storcon needs to store the plan type in the database and pass that information to the pageserver when attaching the tenant.
+* Storcon also needs to update PostHog with the active tenants, for example, when the tenant gets detached/attached. Assume each active tenant gets detached/attached every week, that would be 4X requests per month.
+* We do not need to report the billing type or resident size to PostHog as all these are evaluated locally.
+* After each local evaluation of the feature flag, we need to call PostHog's capture event API to report the evaluation result, i.e., whether the feature is enabled. We can do this when the flag gets changed compared with the last cached state in memory. That would be at least 4X (assuming we deploy every week, so the cache gets cleared) and maybe an additional multiplier of 10, assuming we have 10 active features.
+
+In this case, we will issue 86400Y + 40X requests per month.
+
+Assume X = 1,000,000 and Y = 100:
+
+| | HTTP Evaluation | Local Evaluation |
+|---|---|---|
+| Latency of propagating the conditions/properties for feature flag | 24 hours | available locally |
+| Latency of applying the feature flag | 1 hour | 5 minutes |
+| Can properties be reported from different services | Yes | No |
+| Do we need to sync billing info etc to pageserver | No | Yes |
+| Cost | 75400$ / month | 4864$ / month |
+
+# Our Solution
+
+We will use PostHog _only_ as a UI to configure the feature flags. Whether a feature is enabled or not can only be queried through storcon/pageserver instead of using the PostHog UI. (We could report it back to PostHog via `capture_event` but it costs $$$.) This allows us to ramp up the feature flag functionality fast at first. At the same time, it would also give us the option to migrate to our own solution once we want to have more properties and more complex evaluation rules in our system.
+
+* We will create several fake users (tenants) in PostHog that contain all the properties we will use for evaluating a feature flag (i.e., resident size, billing type, pageserver id, etc.)
+* We will use PostHog's local evaluation API to poll the configuration of the feature flags and evaluate them locally on each pageserver.
+* The evaluation result will not be reported back to PostHog.
+* Storcon needs to pull some information from the cplane database.
+* To know if a feature is currently enabled or not, we need to call the storcon/pageserver API; and we won't easily be able to know whether a feature has been enabled on a tenant before: we need to look at the Grafana logs.
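The rollout logic described above (hash the flag key plus the tenant id onto `[0, 1)` and compare the result with the rollout percentage) can be sketched as follows. This is illustrative only: `DefaultHasher` is a stand-in for whatever hash PostHog actually uses, and the function names are made up.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Map `flag_key + tenant_id` deterministically to a float in [0, 1).
/// The concrete hash function here is a stand-in, not PostHog's.
fn rollout_bucket(flag_key: &str, tenant_id: &str) -> f64 {
    let mut hasher = DefaultHasher::new();
    format!("{flag_key}.{tenant_id}").hash(&mut hasher);
    (hasher.finish() % 10_000) as f64 / 10_000.0
}

/// A tenant gets the feature if its bucket falls at or below the rollout
/// percentage. Because the bucket is deterministic per tenant, raising the
/// percentage only grows the enabled set; it never flips tenants back off.
fn is_enabled(flag_key: &str, tenant_id: &str, rollout_percentage: f64) -> bool {
    rollout_bucket(flag_key, tenant_id) <= rollout_percentage / 100.0
}

fn main() {
    for tenant in ["tenant-a", "tenant-b", "tenant-c"] {
        println!("{tenant}: {}", is_enabled("gc-compaction", tenant, 25.0));
    }
}
```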
+ +We only need to pay for the 86400Y local evaluation requests (that would be setting Y=0 in solution 2 => $864/month, and even less if we proxy it through storcon). + +## Implementation + +* Pageserver: implement a PostHog local evaluation client. The client will be shared across all tenants on the pageserver with a single API: `evaluate(tenant_id, feature_flag, properties) -> json`. +* Storcon: if we need plan type as the evaluation condition, pull it from cplane database. +* Storcon/Pageserver: implement an HTTP API `:tenant_id/feature/:feature` to retrieve the current feature flag status. +* Storcon/Pageserver: a loop to update the feature flag spec on both storcon and pageserver. Pageserver loop will only be activated if storcon does not push the specs to the pageserver. + +## Difference from Tenant Config + +* Feature flags can be modified by percentage, and the default config for each feature flag can be modified in UI without going through the release process. +* Feature flags are more flexible and won't be persisted anywhere and will be passed as plain JSON over the wire so that do not need to handle backward/forward compatibility as in tenant config. +* The expectation of tenant config is that once we add a flag we cannot remove it (or it will be hard to remove), but feature flags are more flexible. + +# Final Implementation + +* We added a new crate `posthog_lite_client` that supports local feature evaluations. +* We set up two projects "Storage (staging)" and "Storage (production)" in the PostHog console. +* Each pageserver reports 10 fake tenants to PostHog so that we can get all combinations of regions (and other properties) in the PostHog UI. +* Supported properties: AZ, neon_region, pageserver, tenant_id. +* You may use "Pageserver Feature Flags" dashboard to see the evaluation status. +* The feature flag spec is polled on storcon every 30s (in each of the region) and storcon will propagate the spec to the pageservers. +* The pageserver housekeeping loop updates the tenant-specific properties (e.g., remote size) for evaluation. + +Each tenant has a `feature_resolver` object. After you add a feature flag in the PostHog console, you can retrieve it with: + +```rust +// Boolean flag +self + .feature_resolver + .evaluate_boolean("flag") + .is_ok() +// Multivariate flag +self + .feature_resolver + .evaluate_multivariate("gc-comapction-strategy") + .ok(); +``` + +The user needs to handle the case where the evaluation result is an error. This can occur in a variety of cases: + +* During the pageserver start, the feature flag spec has not been retrieved. +* No condition group is matched. +* The feature flag spec contains an operand/operation not supported by the lite PostHog library. + +For boolean flags, the return value is `Result<(), Error>`. `Ok(())` means the flag is evaluated to true. Otherwise, +there is either an error in evaluation or it does not match any groups. + +For multivariate flags, the return value is `Result`. `Ok(variant)` indicates the flag is evaluated +to a variant. Otherwise, there is either an error in evaluation or it does not match any groups. + +The evaluation logic is documented in the PostHog lite library. It compares the consistent hash of a flag key + tenant_id +with the rollout percentage and determines which tenant to roll out a specific feature. + +Users can use the feature flag evaluation API to get the flag evaluation result of a specific tenant for debugging purposes. 
+
+```
+curl "http://localhost:9898/v1/tenant/:tenant_id/feature_flag?flag=:key&as=multivariate/boolean"
+```
+
+By default, the storcon pushes the feature flag specs to the pageservers every 30 seconds, which means that a change to a feature flag in the
+PostHog UI will propagate to the pageservers within 30 seconds.
+
+# Future Work
+
+* Support dynamic tenant properties like logical size as evaluation conditions.
+* Support properties like `plan_type` (needs cplane to pass it down).
+* Report the feature flag evaluation result back to PostHog (if the cost is okay).
+* Fast feature flag evaluation cache on critical paths (e.g., cache a feature flag result in an `AtomicBool` and use it on the read path).
\ No newline at end of file
diff --git a/docs/rfcs/2025-03-17-compute-prewarm.md b/docs/rfcs/2025-03-17-compute-prewarm.md
new file mode 100644
index 0000000000..6e95b9ac39
--- /dev/null
+++ b/docs/rfcs/2025-03-17-compute-prewarm.md
@@ -0,0 +1,399 @@
+# Compute rolling restart with prewarm
+
+Created on 2025-03-17
+Implemented on _TBD_
+Author: Alexey Kondratov (@ololobus)
+
+## Summary
+
+This RFC describes an approach to reduce performance degradation due to missing caches after a compute node restart, i.e.:
+
+1. Rolling restart of the running instance via a 'warm' replica.
+2. Auto-prewarm of compute caches after an unplanned restart or scale-to-zero.
+
+## Motivation
+
+Neon currently implements several features that guarantee high uptime of compute nodes:
+
+1. Storage high availability (HA), i.e. each tenant shard has a secondary pageserver location, so we can quickly switch over a compute to it in case of primary pageserver failure.
+2. Fast compute provisioning, i.e. we have a fleet of pre-created empty computes that are ready to serve workload, so restarting an unresponsive compute is very fast.
+3. Preemptive NeonVM compute provisioning in case of k8s node unavailability.
+
+This helps us stay well within the uptime SLO of 99.95% most of the time. Problems begin when we go up to multi-TB workloads and 32-64 CU computes.
+During a restart, the compute loses all caches: LFC, shared buffers, file system cache. Depending on the workload, it can take a lot of time to warm up the caches,
+so performance can be degraded and may even be unacceptable for certain workloads. In other words, although the current approach works well for small to
+medium workloads, we still have to do some additional work to avoid performance degradation after restarting large instances.
+
+## Non Goals
+
+- Details of the persistent storage for prewarm data are out of scope; there is a separate RFC for that.
+- Complete compute/Postgres HA setup and flow. Although it was originally in the scope of this RFC, during preliminary research it appeared to be a rabbit hole, so it's worth a separate RFC.
+- Low-level implementation details for Postgres replica-to-primary promotion. There are a lot of things to think and care about: how to start walproposer, [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html), and so on, but it's worth at least a separate one-pager design document, if not an RFC.
+
+## Impacted components
+
+Postgres, compute_ctl, Control plane, Endpoint storage for unlogged storage of compute files.
+For the latter, we will need to implement a uniform abstraction layer on top of S3, ABS, etc., but
+'S3' is used in this text interchangeably with 'endpoint storage' for simplicity (a minimal sketch of such an abstraction follows).
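+
+As a rough illustration of what that abstraction could look like, here is a hypothetical sketch; the trait and method names are assumptions made for this RFC, not the actual endpoint storage API:
+
+```rust
+use std::io;
+
+/// Minimal interface a compute-facing 'endpoint storage' could expose,
+/// regardless of whether the backend is S3, ABS, or something else.
+#[allow(async_fn_in_trait)]
+pub trait EndpointStorage {
+    /// Store a small unlogged file (e.g. an LFC state dump) under `key`, overwriting any previous version.
+    async fn put(&self, key: &str, body: Vec<u8>) -> io::Result<()>;
+    /// Fetch a previously stored file, or `None` if it does not exist.
+    async fn get(&self, key: &str) -> io::Result<Option<Vec<u8>>>;
+    /// Delete everything under a prefix (endpoint, timeline, or tenant scope).
+    async fn delete_prefix(&self, prefix: &str) -> io::Result<()>;
+}
+```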
+
+## Proposed implementation
+
+### compute_ctl spec changes and auto-prewarm
+
+We are going to extend the current compute spec with the following attributes:
+
+```rust
+struct ComputeSpec {
+    /// [All existing attributes]
+    ...
+    /// Whether to do auto-prewarm at start or not.
+    /// Defaults to `false`.
+    pub lfc_auto_prewarm: bool,
+    /// Interval in seconds between automatic dumps of
+    /// LFC state into S3. Default `None`, which means 'off'.
+    pub lfc_dump_interval_sec: Option<u64>,
+}
+```
+
+When `lfc_dump_interval_sec` is set to `N`, `compute_ctl` will periodically dump the LFC state
+and store it in S3, so that it can be used either for auto-prewarm after a restart or by a replica
+during the rolling restart. For periodic dumping, we should consider the value
+`lfc_dump_interval_sec=300` (5 minutes), the same as upstream's `pg_prewarm.autoprewarm_interval`.
+
+When `lfc_auto_prewarm` is set to `true`, `compute_ctl` will start prewarming the LFC upon restart
+if one of the previous states is present in S3.
+
+### compute_ctl API
+
+1. `POST /store_lfc_state` -- dump the LFC state using the Postgres SQL interface and store the result in S3.
+   This has to be a blocking call, i.e. it will return only after the state is stored in S3.
+   If there is any concurrent request in progress, we should return `429 Too Many Requests`
+   and let the caller retry.
+
+2. `GET /dump_lfc_state` -- dump the LFC state using the Postgres SQL interface and return it as is,
+   in a text format suitable for a future restore/prewarm. This API is not strictly needed in
+   the end state, but could be useful for faster prototyping of a complete rolling restart flow
+   with prewarm, as it doesn't require persistent storage for the LFC state.
+
+3. `POST /restore_lfc_state` -- restore/prewarm the LFC state with request
+
+   ```yaml
+   RestoreLFCStateRequest:
+     oneOf:
+       - type: object
+         required:
+           - lfc_state
+         properties:
+           lfc_state:
+             type: string
+             description: Raw LFC content dumped with GET `/dump_lfc_state`
+       - type: object
+         required:
+           - lfc_cache_key
+         properties:
+           lfc_cache_key:
+             type: string
+             description: |
+               endpoint_id of the source endpoint on the same branch
+               to use as a 'donor' for LFC content. Compute will look up
+               the LFC content dump in S3 using this key and do the prewarm.
+   ```
+
+   where `lfc_state` and `lfc_cache_key` are mutually exclusive.
+
+   The actual prewarming will happen asynchronously, so the caller needs to check the
+   prewarm status using the compute's standard `GET /status` API.
+
+4. `GET /status` -- extend the existing API with the following attributes
+
+   ```rust
+   struct ComputeStatusResponse {
+       // [All existing attributes]
+       ...
+       pub prewarm_state: PrewarmState,
+   }
+
+   /// Compute prewarm state. Will be stored in the shared Compute state
+   /// in compute_ctl.
+   struct PrewarmState {
+       pub status: PrewarmStatus,
+       /// Total number of pages to prewarm
+       pub pages_total: i64,
+       /// Number of pages prewarmed so far
+       pub pages_processed: i64,
+       /// Optional prewarm error
+       pub error: Option<String>,
+   }
+
+   pub enum PrewarmStatus {
+       /// Prewarming was never requested on this compute
+       Off,
+       /// Prewarming was requested, but not started yet
+       Pending,
+       /// Prewarming is in progress. The caller should follow
+       /// `PrewarmState::pages_processed` and `pages_total`.
+       InProgress,
+       /// Prewarming has been successfully completed
+       Completed,
+       /// Prewarming failed. The caller should look at
+       /// `PrewarmState::error` for the reason.
+       Failed,
+       /// It is intended to be used by auto-prewarm if none of
+       /// the previous LFC states is available in S3.
+       /// This is a distinct state from `Failed` because
+       /// technically it's not a failure and could happen if
+       /// the compute was restarted before it dumped anything into S3,
+       /// or just after the initial rollout of the feature.
+       Skipped,
+   }
+   ```
+
+5. `POST /promote` -- this is a **blocking** API call to promote a compute replica into a primary.
+   This API should be very similar to the existing `POST /configure` API, i.e. accept the
+   spec (a primary spec, because the compute was originally started as a replica). It's a distinct
+   API method because the semantics and response codes are different:
+
+   - If promotion is done successfully, it will return `200 OK`.
+   - If the compute is already a primary, the call will be a no-op and `compute_ctl`
+     will return `412 Precondition Failed`.
+   - If, for some reason, a second request reaches a compute that is in the process of promotion,
+     it will respond with `429 Too Many Requests`.
+   - If the compute hits any permanent failure during promotion, `500 Internal Server Error`
+     will be returned.
+
+### Control plane operations
+
+The complete flow will be presented as a sequence diagram in the next section, but here
+we just want to list some important steps that have to be done by the control plane during
+the rolling restart via a warm replica, without too many low-level implementation details.
+
+1. Register the 'intent' of the instance restart, but do not interrupt any workload at the
+   primary yet, and keep accepting new connections. This may require some endpoint state machine
+   changes, e.g. the introduction of a `pending_restart` state. Being in this state also
+   **mustn't prevent any other operations except restart**: suspend, live-reconfiguration
+   (e.g. due to a notify-attach call from the storage controller), deletion.
+
+2. Start a new replica compute on the same timeline and start prewarming it. This process
+   may take quite a while, so the same concurrency considerations as in 1. should be applied
+   here as well.
+
+3. When the warm replica is ready, the control plane should:
+
+   3.1. Terminate the primary compute. Starting from here, **this is a critical section**:
+        if anything goes wrong, the only option is to start the primary normally and proceed
+        with auto-prewarm.
+
+   3.2. Send a cache invalidation message to all proxies, notifying them that all new connections
+        should request and wait for the new connection details. At this stage, the proxy also has to
+        drop any existing connections to the old primary, so that they don't do stale reads.
+
+   3.3. Attach the warm replica compute to the primary endpoint inside the control plane metadata
+        database.
+
+   3.4. Promote the replica to primary.
+
+   3.5. When everything is done, finalize the endpoint state to be just `active`.
+
+### Complete rolling restart flow
+
+```mermaid
+  sequenceDiagram
+
+  autonumber
+
+  participant proxy as Neon proxy
+
+  participant cplane as Control plane
+
+  participant primary as Compute (primary)
+  box Compute (replica)
+  participant ctl as compute_ctl
+  participant pg as Postgres
+  end
+
+  box Endpoint unlogged storage
+  participant s3proxy as Endpoint storage service
+  participant s3 as S3/ABS/etc.
+ end + + + cplane ->> primary: POST /store_lfc_state + primary -->> cplane: 200 OK + + cplane ->> ctl: POST /restore_lfc_state + activate ctl + ctl -->> cplane: 202 Accepted + + activate cplane + cplane ->> ctl: GET /status: poll prewarm status + ctl ->> s3proxy: GET /read_file + s3proxy ->> s3: read file + s3 -->> s3proxy: file content + s3proxy -->> ctl: 200 OK: file content + + proxy ->> cplane: GET /proxy_wake_compute + cplane -->> proxy: 200 OK: old primary conninfo + + ctl ->> pg: prewarm LFC + activate pg + pg -->> ctl: prewarm is completed + deactivate pg + + ctl -->> cplane: 200 OK: prewarm is completed + deactivate ctl + deactivate cplane + + cplane -->> cplane: reassign replica compute to endpoint,
start terminating the old primary compute + activate cplane + cplane ->> proxy: invalidate caches + + proxy ->> cplane: GET /proxy_wake_compute + + cplane -x primary: POST /terminate + primary -->> cplane: 200 OK + note over primary: old primary
compute terminated + + cplane ->> ctl: POST /promote + activate ctl + ctl ->> pg: pg_ctl promote + activate pg + pg -->> ctl: done + deactivate pg + ctl -->> cplane: 200 OK + deactivate ctl + + cplane -->> cplane: finalize operation + cplane -->> proxy: 200 OK: new primary conninfo + deactivate cplane +``` + +### Network bandwidth and prewarm speed + +It's currently known that pageserver can sustain about 3000 RPS per shard for a few running computes. +Large tenants are usually split into 8 shards, so the final formula may look like this: + +```text +8 shards * 3000 RPS * 8 KB =~ 190 MB/s +``` + +so depending on the LFC size, prewarming will take at least: + +- ~5s for 1 GB +- ~50s for 10 GB +- ~5m for 100 GB +- \>1h for 1 TB + +In total, one pageserver is normally capped by 30k RPS, so it obviously can't sustain many computes +doing prewarm at the same time. Later, we may need an additional mechanism for computes to throttle +the prewarming requests gracefully. + +### Reliability, failure modes and corner cases + +We consider following failures while implementing this RFC: + +1. Compute got interrupted/crashed/restarted during prewarm. The caller -- control plane -- should + detect that and start prewarm from the beginning. + +2. Control plane promotion request timed out or hit network issues. If it never reached the + compute, control plane should just repeat it. If it did reach the compute, then during + retry control plane can hit `409` as previous request triggered the promotion already. + In this case, control plane need to retry until either `200` or + permanent error `500` is returned. + +3. Compute got interrupted/crashed/restarted during promotion. At restart it will ask for + a spec from control plane, and its content should signal compute to start as **primary**, + so it's expected that control plane will continue polling for certain period of time and + will discover that compute is ready to accept connections if restart is fast enough. + +4. Any other unexpected failure or timeout during prewarming. This **failure mustn't be fatal**, + control plane has to report failure, terminate replica and keep primary running. + +5. Any other unexpected failure or timeout during promotion. Unfortunately, at this moment + we already have the primary node stopped, so the only option is to start primary again + and proceed with auto-prewarm. + +6. Any unexpected failure during auto-prewarm. This **failure mustn't be fatal**, + `compute_ctl` has to report the failure, but do not crash the compute. + +7. Control plane failed to confirm that old primary has terminated. This can happen, especially + in the future HA setup. In this case, control plane has to ensure that it sent VM deletion + and pod termination requests to k8s, so long-term we do not have two running primaries + on the same timeline. + +### Security implications + +There are two security implications to consider: + +1. Access to `compute_ctl` API. It has to be accessible from the outside of compute, so all + new API methods have to be exposed on the **external** HTTP port and **must** be authenticated + with JWT. + +2. Read/write only your own LFC state data in S3. Although it's not really a security concern, + since LFC state is just a mapping of blocks present in LFC at certain moment in time; + it still has to be highly restricted, so that i) only computes on the same timeline can + read S3 state; ii) each compute can only write to the path that contains it's `endpoint_id`. 
+   Both of these must be validated by the Endpoint storage service using the JWT token provided by `compute_ctl` (a minimal sketch of this check is included at the end of this RFC).
+
+### Unresolved questions
+
+#### Billing, metrics and monitoring
+
+Currently, we only label computes with `endpoint_id` after attaching them to the endpoint.
+In this proposal, this means that the temporary replica will remain unlabelled until it's promoted
+to primary. We can also hide it from users in the control plane API, but what to do with
+billing and monitoring is still unclear.
+
+We can probably mark it as 'billable' and tag it with `project_id`, so it will be billed but
+will not interfere in any way with the current primary monitoring.
+
+Another thing to consider is how the logs and metrics export will switch to the new compute.
+It's expected that the OpenTelemetry collector will auto-discover the new compute and start
+scraping metrics from it.
+
+#### Auto-prewarm
+
+It's still an open question whether we need auto-prewarm at all. The author's gut feeling is
+that yes, we need it, but maybe not for all workloads, so it could end up exposed as a
+user-controllable knob on the endpoint. There are two arguments for that:
+
+1. Auto-prewarm exists in upstream's `pg_prewarm`, _probably for a reason_.
+
+2. There could still be 2 flows where we cannot perform the rolling restart via the warm
+   replica: i) any failure or interruption during promotion; ii) wake-up after scale-to-zero.
+   The latter might be challenged as well, i.e. one can argue that auto-prewarm may and will
+   compete with the user workload for storage resources. This is correct, but it can also
+   reduce the time to get a warm LFC and good performance.
+
+#### Low-level details of the replica promotion
+
+There are many things to consider here; three items just off the top of my head:
+
+1. How to properly start the `walproposer` inside Postgres.
+
+2. What to do with logical replication. Currently, we do not include logical replication slots
+   inside the basebackup, because nobody advances them on a replica, so they would just prevent WAL
+   deletion. Yet, we do need to have them on the primary after promotion. Starting with Postgres 17,
+   there is a new feature called
+   [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html)
+   and the `synchronized_standby_slots` setting, but we need a plan for the older versions. Should we
+   request a new basebackup during promotion?
+
+3. How do we guarantee that the replica will receive all the latest WAL from safekeepers? Do some
+   'shallow' version of sync safekeepers without data copying? Or just the standard version of
+   sync safekeepers?
+
+## Alternative implementation
+
+The proposal already assumes one of the alternatives -- not having any persistent storage for
+the LFC state. This is faster to implement with the proposed API, but it means that
+we do not implement auto-prewarm yet.
+
+## Definition of Done
+
+At the end of implementing this RFC we should have two high-level settings that enable:
+
+1. Auto-prewarm of user computes upon restart.
+2. Primary compute restart via warm replica promotion.
+
+It also has to be decided what the criteria are for enabling one or both of these flows for
+certain clients.
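+
+As a rough illustration of the path-scoping rule from the security section, here is a hypothetical sketch of the authorization check; the claim and parameter names are assumptions made for this RFC, not the actual endpoint storage code:
+
+```rust
+/// Claims we assume are carried in the compute's JWT.
+struct Claims {
+    tenant_id: String,
+    timeline_id: String,
+    endpoint_id: String,
+}
+
+/// Reads are allowed for any compute on the same tenant/timeline,
+/// writes only for the compute's own endpoint_id.
+fn authorize(claims: &Claims, tenant: &str, timeline: &str, endpoint: &str, is_write: bool) -> bool {
+    let same_timeline = claims.tenant_id == tenant && claims.timeline_id == timeline;
+    if is_write {
+        same_timeline && claims.endpoint_id == endpoint
+    } else {
+        same_timeline
+    }
+}
+```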
diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index 42431c0066..a7a18743ef 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -13,6 +13,8 @@ use utils::backoff::retry; pub fn app(state: Arc) -> Router<()> { use axum::routing::{delete as _delete, get as _get}; let delete_prefix = _delete(delete_prefix); + // NB: On any changes do not forget to update the OpenAPI spec + // in /endpoint_storage/src/openapi_spec.yml. Router::new() .route( "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", diff --git a/endpoint_storage/src/openapi_spec.yml b/endpoint_storage/src/openapi_spec.yml new file mode 100644 index 0000000000..8d9abf902c --- /dev/null +++ b/endpoint_storage/src/openapi_spec.yml @@ -0,0 +1,146 @@ +openapi: "3.0.2" +info: + title: Endpoint Storage API + description: Endpoint Storage API + version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE +servers: + - url: "" +paths: + /status: + description: Healthcheck endpoint + get: + description: Healthcheck + security: [] + responses: + "200": + description: OK + + /{tenant_id}/{timeline_id}/{endpoint_id}/{key}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: endpoint_id + in: path + required: true + schema: + type: string + - name: key + in: path + required: true + schema: + type: string + get: + description: Get file from blob storage + responses: + "200": + description: "File stream from blob storage" + content: + application/octet-stream: + schema: + type: string + format: binary + "400": + description: File was not found + "403": + description: JWT does not authorize request to this route + put: + description: Insert file into blob storage. 
If file exists, override it + requestBody: + content: + application/octet-stream: + schema: + type: string + format: binary + responses: + "200": + description: File was inserted successfully + "403": + description: JWT does not authorize request to this route + delete: + description: Delete file from blob storage + responses: + "200": + description: File was successfully deleted or not found + "403": + description: JWT does not authorize request to this route + + /{tenant_id}/{timeline_id}/{endpoint_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: endpoint_id + in: path + required: true + schema: + type: string + delete: + description: Delete endpoint data from blob storage + responses: + "200": + description: Endpoint data was deleted + "403": + description: JWT does not authorize request to this route + + /{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + delete: + description: Delete timeline data from blob storage + responses: + "200": + description: Timeline data was deleted + "403": + description: JWT does not authorize request to this route + + /{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + delete: + description: Delete tenant data from blob storage + responses: + "200": + description: Tenant data was deleted + "403": + description: JWT does not authorize request to this route + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + +security: + - JWT: [] diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5cad849e3d..a54411b06a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -58,7 +58,7 @@ pub enum LfcPrewarmState { }, } -#[derive(Serialize, Default, Debug, Clone)] +#[derive(Serialize, Default, Debug, Clone, PartialEq)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcOffloadState { #[default] diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6b2caa9d3a..60311aa3e6 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -181,10 +181,14 @@ pub struct ComputeSpec { /// JWT for authorizing requests to endpoint storage service pub endpoint_storage_token: Option, - /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] + /// Download LFC state from endpoint storage and pass it to Postgres on compute startup pub autoprewarm: bool, + #[serde(default)] + /// Upload LFC state to endpoint storage periodically. Default value (None) means "don't upload" + pub offload_lfc_interval_seconds: Option, + /// Suspend timeout in seconds. /// /// We use this value to derive other values, such as the installed extensions metric. 
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 94d7f1e081..86ab8c6e32 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -90,6 +90,11 @@ "value": "off", "vartype": "bool" }, + { + "name": "offload_lfc_interval_seconds", + "value": "20", + "vartype": "integer" + }, { "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs index 9024a90a82..afb2ae8f47 100644 --- a/libs/http-utils/src/request.rs +++ b/libs/http-utils/src/request.rs @@ -41,17 +41,35 @@ pub fn get_query_param<'a>( Some(q) => q, None => return Ok(None), }; - let mut values = url::form_urlencoded::parse(query.as_bytes()) + let values = url::form_urlencoded::parse(query.as_bytes()) .filter_map(|(k, v)| if k == param_name { Some(v) } else { None }) // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards .fuse(); - let value1 = values.next(); - if values.next().is_some() { - return Err(ApiError::BadRequest(anyhow!( - "param {param_name} specified more than once" - ))); - } + // Work around an issue with Alloy's pyroscope scrape where the "seconds" + // parameter is added several times. https://github.com/grafana/alloy/issues/3026 + // TODO: revert after Alloy is fixed. + let value1 = values + .map(Ok) + .reduce(|acc, i| { + match acc { + Err(_) => acc, + + // It's okay to have duplicates as along as they have the same value. + Ok(ref a) if a == &i.unwrap() => acc, + + _ => Err(ApiError::BadRequest(anyhow!( + "param {param_name} specified more than once" + ))), + } + }) + .transpose()?; + // if values.next().is_some() { + // return Err(ApiError::BadRequest(anyhow!( + // "param {param_name} specified more than once" + // ))); + // } + Ok(value1) } @@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> None => Ok(()), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_query_param_duplicate() { + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert_eq!(value.unwrap(), "1"); + + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1&testparam=1") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert_eq!(value.unwrap(), "1"); + + let req = Request::builder() + .uri("http://localhost:12345/testuri") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert!(value.is_none()); + + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam"); + assert!(value.is_err()); + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00d6b61399..dc7e9aed7f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -5,6 +5,7 @@ mod tests; use const_format::formatcp; use posthog_client_lite::PostHogClientConfig; +use utils::serde_percent::Percent; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; @@ -223,7 +224,7 @@ pub struct 
ConfigToml { pub metric_collection_bucket: Option, #[serde(with = "humantime_serde")] pub synthetic_size_calculation_interval: Duration, - pub disk_usage_based_eviction: Option, + pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, #[serde(with = "humantime_serde")] @@ -273,6 +274,7 @@ pub struct ConfigToml { } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(default)] pub struct DiskUsageEvictionTaskConfig { pub max_usage_pct: utils::serde_percent::Percent, pub min_avail_bytes: u64, @@ -283,6 +285,21 @@ pub struct DiskUsageEvictionTaskConfig { /// Select sorting for evicted layers #[serde(default)] pub eviction_order: EvictionOrder, + pub enabled: bool, +} + +impl Default for DiskUsageEvictionTaskConfig { + fn default() -> Self { + Self { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + #[cfg(feature = "testing")] + mock_statvfs: None, + eviction_order: EvictionOrder::default(), + enabled: true, + } + } } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -738,7 +755,7 @@ impl Default for ConfigToml { metric_collection_bucket: (None), - disk_usage_based_eviction: (None), + disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(), test_remote_failures: (0), diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a8080a57e9..a8c7083b17 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -386,6 +386,7 @@ pub enum NodeSchedulingPolicy { Pause, PauseForRestart, Draining, + Deleting, } impl FromStr for NodeSchedulingPolicy { @@ -398,6 +399,7 @@ impl FromStr for NodeSchedulingPolicy { "pause" => Ok(Self::Pause), "pause_for_restart" => Ok(Self::PauseForRestart), "draining" => Ok(Self::Draining), + "deleting" => Ok(Self::Deleting), _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), } } @@ -412,6 +414,7 @@ impl From for String { Pause => "pause", PauseForRestart => "pause_for_restart", Draining => "draining", + Deleting => "deleting", } .to_string() } @@ -420,6 +423,7 @@ impl From for String { #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum SkSchedulingPolicy { Active, + Activating, Pause, Decomissioned, } @@ -430,6 +434,7 @@ impl FromStr for SkSchedulingPolicy { fn from_str(s: &str) -> Result { Ok(match s { "active" => Self::Active, + "activating" => Self::Activating, "pause" => Self::Pause, "decomissioned" => Self::Decomissioned, _ => { @@ -446,6 +451,7 @@ impl From for String { use SkSchedulingPolicy::*; match value { Active => "active", + Activating => "activating", Pause => "pause", Decomissioned => "decomissioned", } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 091299f842..851d824291 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -78,7 +78,13 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( e.kind(), - BrokenPipe | ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut + HostUnreachable + | NetworkUnreachable + | BrokenPipe + | ConnectionRefused + | ConnectionAborted + | ConnectionReset + | TimedOut, ) } diff --git a/libs/proxy/json/Cargo.toml b/libs/proxy/json/Cargo.toml new file mode 100644 index 0000000000..2f163c141d --- /dev/null +++ b/libs/proxy/json/Cargo.toml @@ -0,0 +1,12 @@ 
+[package] +name = "json" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +ryu = "1" +itoa = "1" + +[dev-dependencies] +futures = "0.3" diff --git a/libs/proxy/json/src/lib.rs b/libs/proxy/json/src/lib.rs new file mode 100644 index 0000000000..a8b2e6b509 --- /dev/null +++ b/libs/proxy/json/src/lib.rs @@ -0,0 +1,412 @@ +//! A JSON serialization lib, designed for more flexibility than `serde_json` offers. +//! +//! Features: +//! +//! ## Dynamic construction +//! +//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc. +//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types. +//! Because of this, it's often easier to give-in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct), +//! but that is often not very efficient. +//! +//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the +//! relevant functions, and it will guarantee a correctly encoded JSON value. +//! +//! ## Async construction +//! +//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory +//! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` is much higher, however +//! there are exceptions. +//! +//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes, +//! whereas serializing values incrementally spreads out the CPU load and reduces lag. +//! +//! ## Examples +//! +//! To represent the following JSON as a compact string +//! +//! ```json +//! { +//! "results": { +//! "rows": [ +//! { +//! "id": 1, +//! "value": null +//! }, +//! { +//! "id": 2, +//! "value": "hello" +//! } +//! ] +//! } +//! } +//! ``` +//! +//! We can use the following code: +//! +//! ``` +//! // create the outer object +//! let s = json::value_to_string!(|v| json::value_as_object!(|v| { +//! // create an entry with key "results" and start an object value associated with it. +//! let results = v.key("results"); +//! json::value_as_object!(|results| { +//! // create an entry with key "rows" and start an list value associated with it. +//! let rows = results.key("rows"); +//! json::value_as_list!(|rows| { +//! // create a list entry and start an object value associated with it. +//! let row = rows.entry(); +//! json::value_as_object!(|row| { +//! // add entry "id": 1 +//! row.entry("id", 1); +//! // add entry "value": null +//! row.entry("value", json::Null); +//! }); +//! +//! // create a list entry and start an object value associated with it. +//! let row = rows.entry(); +//! json::value_as_object!(|row| { +//! // add entry "id": 2 +//! row.entry("id", 2); +//! // add entry "value": "hello" +//! row.entry("value", "hello"); +//! }); +//! }); +//! }); +//! })); +//! +//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#); +//! ``` + +mod macros; +mod str; +mod value; + +pub use value::{Null, ValueEncoder}; + +#[must_use] +/// Serialize a single json value. +pub struct ValueSer<'buf> { + buf: &'buf mut Vec, + start: usize, +} + +impl<'buf> ValueSer<'buf> { + /// Create a new json value serializer. 
+ pub fn new(buf: &'buf mut Vec) -> Self { + Self { buf, start: 0 } + } + + /// Borrow the underlying buffer + pub fn as_buffer(&self) -> &[u8] { + self.buf + } + + #[inline] + pub fn value(self, e: impl ValueEncoder) { + e.encode(self); + } + + /// Write raw bytes to the buf. This must be already JSON encoded. + #[inline] + pub fn write_raw_json(self, data: &[u8]) { + self.buf.extend_from_slice(data); + self.finish(); + } + + /// Start a new object serializer. + #[inline] + pub fn object(self) -> ObjectSer<'buf> { + ObjectSer::new(self) + } + + /// Start a new list serializer. + #[inline] + pub fn list(self) -> ListSer<'buf> { + ListSer::new(self) + } + + /// Finish the value ser. + #[inline] + fn finish(self) { + // don't trigger the drop handler which triggers a rollback. + // this won't cause memory leaks because `ValueSet` owns no allocations. + std::mem::forget(self); + } +} + +impl Drop for ValueSer<'_> { + fn drop(&mut self) { + self.buf.truncate(self.start); + } +} + +#[must_use] +/// Serialize a json object. +pub struct ObjectSer<'buf> { + value: ValueSer<'buf>, + start: usize, +} + +impl<'buf> ObjectSer<'buf> { + /// Start a new object serializer. + #[inline] + pub fn new(value: ValueSer<'buf>) -> Self { + value.buf.push(b'{'); + let start = value.buf.len(); + Self { value, start } + } + + /// Borrow the underlying buffer + pub fn as_buffer(&self) -> &[u8] { + self.value.as_buffer() + } + + /// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value. + #[inline] + pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> { + key.write_key(self) + } + + /// Write an entry (key-value pair) to the object. + #[inline] + pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) { + self.key(key).value(val); + } + + #[inline] + fn entry_inner(&mut self, f: impl FnOnce(&mut Vec)) -> ValueSer<'_> { + // track before the separator so we the value is rolled back it also removes the separator. + let start = self.value.buf.len(); + + // push separator if necessary + if self.value.buf.len() > self.start { + self.value.buf.push(b','); + } + // push key + f(self.value.buf); + // push value separator + self.value.buf.push(b':'); + + // return value writer. + ValueSer { + buf: self.value.buf, + start, + } + } + + /// Reset the buffer back to before this object was started. + #[inline] + pub fn rollback(self) -> ValueSer<'buf> { + // Do not fully reset the value, only reset it to before the `{`. + // This ensures any `,` before this value are not clobbered. + self.value.buf.truncate(self.start - 1); + self.value + } + + /// Finish the object ser. + #[inline] + pub fn finish(self) { + self.value.buf.push(b'}'); + self.value.finish(); + } +} + +pub trait KeyEncoder { + fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>; +} + +#[must_use] +/// Serialize a json object. +pub struct ListSer<'buf> { + value: ValueSer<'buf>, + start: usize, +} + +impl<'buf> ListSer<'buf> { + /// Start a new list serializer. + #[inline] + pub fn new(value: ValueSer<'buf>) -> Self { + value.buf.push(b'['); + let start = value.buf.len(); + Self { value, start } + } + + /// Borrow the underlying buffer + pub fn as_buffer(&self) -> &[u8] { + self.value.as_buffer() + } + + /// Write an value to the list. + #[inline] + pub fn push(&mut self, val: impl ValueEncoder) { + self.entry().value(val); + } + + /// Start a new value entry in this list. 
+ #[inline] + pub fn entry(&mut self) -> ValueSer<'_> { + // track before the separator so we the value is rolled back it also removes the separator. + let start = self.value.buf.len(); + + // push separator if necessary + if self.value.buf.len() > self.start { + self.value.buf.push(b','); + } + + // return value writer. + ValueSer { + buf: self.value.buf, + start, + } + } + + /// Reset the buffer back to before this object was started. + #[inline] + pub fn rollback(self) -> ValueSer<'buf> { + // Do not fully reset the value, only reset it to before the `[`. + // This ensures any `,` before this value are not clobbered. + self.value.buf.truncate(self.start - 1); + self.value + } + + /// Finish the object ser. + #[inline] + pub fn finish(self) { + self.value.buf.push(b']'); + self.value.finish(); + } +} + +#[cfg(test)] +mod tests { + use crate::{Null, ValueSer}; + + #[test] + fn object() { + let mut buf = vec![]; + let mut object = ValueSer::new(&mut buf).object(); + object.entry("foo", "bar"); + object.entry("baz", Null); + object.finish(); + + assert_eq!(buf, br#"{"foo":"bar","baz":null}"#); + } + + #[test] + fn list() { + let mut buf = vec![]; + let mut list = ValueSer::new(&mut buf).list(); + list.entry().value("bar"); + list.entry().value(Null); + list.finish(); + + assert_eq!(buf, br#"["bar",null]"#); + } + + #[test] + fn object_macro() { + let res = crate::value_to_string!(|obj| { + crate::value_as_object!(|obj| { + obj.entry("foo", "bar"); + obj.entry("baz", Null); + }) + }); + + assert_eq!(res, r#"{"foo":"bar","baz":null}"#); + } + + #[test] + fn list_macro() { + let res = crate::value_to_string!(|list| { + crate::value_as_list!(|list| { + list.entry().value("bar"); + list.entry().value(Null); + }) + }); + + assert_eq!(res, r#"["bar",null]"#); + } + + #[test] + fn rollback_on_drop() { + let res = crate::value_to_string!(|list| { + crate::value_as_list!(|list| { + list.entry().value("bar"); + + 'cancel: { + let nested_list = list.entry(); + crate::value_as_list!(|nested_list| { + nested_list.entry().value(1); + + assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#); + if true { + break 'cancel; + } + }) + } + + assert_eq!(list.as_buffer(), br#"["bar""#); + + list.entry().value(Null); + }) + }); + + assert_eq!(res, r#"["bar",null]"#); + } + + #[test] + fn rollback_object() { + let res = crate::value_to_string!(|obj| { + crate::value_as_object!(|obj| { + let entry = obj.key("1"); + entry.value(1_i32); + + let entry = obj.key("2"); + let entry = { + let mut nested_obj = entry.object(); + nested_obj.entry("foo", "bar"); + nested_obj.rollback() + }; + + entry.value(2_i32); + }) + }); + + assert_eq!(res, r#"{"1":1,"2":2}"#); + } + + #[test] + fn rollback_list() { + let res = crate::value_to_string!(|list| { + crate::value_as_list!(|list| { + let entry = list.entry(); + entry.value(1_i32); + + let entry = list.entry(); + let entry = { + let mut nested_list = entry.list(); + nested_list.push("foo"); + nested_list.rollback() + }; + + entry.value(2_i32); + }) + }); + + assert_eq!(res, r#"[1,2]"#); + } + + #[test] + fn string_escaping() { + let mut buf = vec![]; + let mut object = ValueSer::new(&mut buf).object(); + + let key = "hello"; + let value = "\n world"; + + object.entry(format_args!("{key:?}"), value); + object.finish(); + + assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#); + } +} diff --git a/libs/proxy/json/src/macros.rs b/libs/proxy/json/src/macros.rs new file mode 100644 index 0000000000..d3b5cfed10 --- /dev/null +++ b/libs/proxy/json/src/macros.rs @@ -0,0 +1,86 @@ +//! 
# Examples +//! +//! ``` +//! use futures::{StreamExt, TryStream, TryStreamExt}; +//! +//! async fn stream_to_json_list(mut s: S) -> Result +//! where +//! S: TryStream + Unpin, +//! T: json::ValueEncoder +//! { +//! Ok(json::value_to_string!(|val| json::value_as_list!(|val| { +//! // note how we can use `.await` and `?` in here. +//! while let Some(value) = s.try_next().await? { +//! val.push(value); +//! } +//! }))) +//! } +//! +//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::); +//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap(); +//! assert_eq!(json_string, "[1,2,3]"); +//! ``` + +/// A helper to create a new JSON vec. +/// +/// Implemented as a macro to preserve all control flow. +#[macro_export] +macro_rules! value_to_vec { + (|$val:ident| $body:expr) => {{ + let mut buf = vec![]; + let $val = $crate::ValueSer::new(&mut buf); + let _: () = $body; + buf + }}; +} + +/// A helper to create a new JSON string. +/// +/// Implemented as a macro to preserve all control flow. +#[macro_export] +macro_rules! value_to_string { + (|$val:ident| $body:expr) => {{ + ::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body)) + .expect("json should be valid utf8") + }}; +} + +/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion. +/// +/// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer. +/// The serializer is only 'finished' if the body completes. +/// The serializer is rolled back if `break`/`return` escapes the body. +/// +/// Implemented as a macro to preserve all control flow. +#[macro_export] +macro_rules! value_as_object { + (|$val:ident| $body:expr) => {{ + let mut obj = $crate::ObjectSer::new($val); + + let $val = &mut obj; + let res = $body; + + obj.finish(); + res + }}; +} + +/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion. +/// +/// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer. +/// The serializer is only 'finished' if the body completes. +/// The serializer is rolled back if `break`/`return` escapes the body. +/// +/// Implemented as a macro to preserve all control flow. +#[macro_export] +macro_rules! value_as_list { + (|$val:ident| $body:expr) => {{ + let mut list = $crate::ListSer::new($val); + + let $val = &mut list; + let res = $body; + + list.finish(); + res + }}; +} diff --git a/libs/proxy/json/src/str.rs b/libs/proxy/json/src/str.rs new file mode 100644 index 0000000000..b092fd50ec --- /dev/null +++ b/libs/proxy/json/src/str.rs @@ -0,0 +1,166 @@ +//! Helpers for serializing escaped strings. +//! +//! ## License +//! +//! +//! +//! Licensed by David Tolnay under MIT or Apache-2.0. +//! +//! With modifications by Conrad Ludgate on behalf of Databricks. + +use std::fmt::{self, Write}; + +/// Represents a character escape code in a type-safe manner. 
+pub enum CharEscape { + /// An escaped quote `"` + Quote, + /// An escaped reverse solidus `\` + ReverseSolidus, + // /// An escaped solidus `/` + // Solidus, + /// An escaped backspace character (usually escaped as `\b`) + Backspace, + /// An escaped form feed character (usually escaped as `\f`) + FormFeed, + /// An escaped line feed character (usually escaped as `\n`) + LineFeed, + /// An escaped carriage return character (usually escaped as `\r`) + CarriageReturn, + /// An escaped tab character (usually escaped as `\t`) + Tab, + /// An escaped ASCII plane control character (usually escaped as + /// `\u00XX` where `XX` are two hex characters) + AsciiControl(u8), +} + +impl CharEscape { + #[inline] + fn from_escape_table(escape: u8, byte: u8) -> CharEscape { + match escape { + self::BB => CharEscape::Backspace, + self::TT => CharEscape::Tab, + self::NN => CharEscape::LineFeed, + self::FF => CharEscape::FormFeed, + self::RR => CharEscape::CarriageReturn, + self::QU => CharEscape::Quote, + self::BS => CharEscape::ReverseSolidus, + self::UU => CharEscape::AsciiControl(byte), + _ => unreachable!(), + } + } +} + +pub(crate) fn format_escaped_str(writer: &mut Vec, value: &str) { + writer.reserve(2 + value.len()); + + writer.push(b'"'); + + let rest = format_escaped_str_contents(writer, value); + writer.extend_from_slice(rest); + + writer.push(b'"'); +} + +pub(crate) fn format_escaped_fmt(writer: &mut Vec, args: fmt::Arguments) { + writer.push(b'"'); + + Collect { buf: writer } + .write_fmt(args) + .expect("formatting should not error"); + + writer.push(b'"'); +} + +struct Collect<'buf> { + buf: &'buf mut Vec, +} + +impl fmt::Write for Collect<'_> { + fn write_str(&mut self, s: &str) -> fmt::Result { + let last = format_escaped_str_contents(self.buf, s); + self.buf.extend(last); + Ok(()) + } +} + +// writes any escape sequences, and returns the suffix still needed to be written. +fn format_escaped_str_contents<'a>(writer: &mut Vec, value: &'a str) -> &'a [u8] { + let bytes = value.as_bytes(); + + let mut start = 0; + + for (i, &byte) in bytes.iter().enumerate() { + let escape = ESCAPE[byte as usize]; + if escape == 0 { + continue; + } + + writer.extend_from_slice(&bytes[start..i]); + + let char_escape = CharEscape::from_escape_table(escape, byte); + write_char_escape(writer, char_escape); + + start = i + 1; + } + + &bytes[start..] +} + +const BB: u8 = b'b'; // \x08 +const TT: u8 = b't'; // \x09 +const NN: u8 = b'n'; // \x0A +const FF: u8 = b'f'; // \x0C +const RR: u8 = b'r'; // \x0D +const QU: u8 = b'"'; // \x22 +const BS: u8 = b'\\'; // \x5C +const UU: u8 = b'u'; // \x00...\x1F except the ones above +const __: u8 = 0; + +// Lookup table of escape sequences. A value of b'x' at index i means that byte +// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped. 
+static ESCAPE: [u8; 256] = [ + // 1 2 3 4 5 6 7 8 9 A B C D E F + UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0 + UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1 + __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 + __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F +]; + +fn write_char_escape(writer: &mut Vec, char_escape: CharEscape) { + let s = match char_escape { + CharEscape::Quote => b"\\\"", + CharEscape::ReverseSolidus => b"\\\\", + // CharEscape::Solidus => b"\\/", + CharEscape::Backspace => b"\\b", + CharEscape::FormFeed => b"\\f", + CharEscape::LineFeed => b"\\n", + CharEscape::CarriageReturn => b"\\r", + CharEscape::Tab => b"\\t", + CharEscape::AsciiControl(byte) => { + static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef"; + let bytes = &[ + b'\\', + b'u', + b'0', + b'0', + HEX_DIGITS[(byte >> 4) as usize], + HEX_DIGITS[(byte & 0xF) as usize], + ]; + return writer.extend_from_slice(bytes); + } + }; + + writer.extend_from_slice(s); +} diff --git a/libs/proxy/json/src/value.rs b/libs/proxy/json/src/value.rs new file mode 100644 index 0000000000..705af9603e --- /dev/null +++ b/libs/proxy/json/src/value.rs @@ -0,0 +1,168 @@ +use core::fmt; +use std::collections::{BTreeMap, HashMap}; + +use crate::str::{format_escaped_fmt, format_escaped_str}; +use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object}; + +/// Write a value to the underlying json representation. +pub trait ValueEncoder { + fn encode(self, v: ValueSer<'_>); +} + +pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec) { + b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes()); +} + +pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec) { + b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes()); +} + +impl ValueEncoder for &T { + #[inline] + fn encode(self, v: ValueSer<'_>) { + T::encode(*self, v); + } +} + +impl ValueEncoder for &str { + #[inline] + fn encode(self, v: ValueSer<'_>) { + format_escaped_str(v.buf, self); + v.finish(); + } +} + +impl ValueEncoder for fmt::Arguments<'_> { + #[inline] + fn encode(self, v: ValueSer<'_>) { + if let Some(s) = self.as_str() { + format_escaped_str(v.buf, s); + } else { + format_escaped_fmt(v.buf, self); + } + v.finish(); + } +} + +macro_rules! int { + [$($t:ty),*] => { + $( + impl ValueEncoder for $t { + #[inline] + fn encode(self, v: ValueSer<'_>) { + write_int(self, v.buf); + v.finish(); + } + } + )* + }; +} + +int![u8, u16, u32, u64, usize, u128]; +int![i8, i16, i32, i64, isize, i128]; + +macro_rules! 
float { + [$($t:ty),*] => { + $( + impl ValueEncoder for $t { + #[inline] + fn encode(self, v: ValueSer<'_>) { + write_float(self, v.buf); + v.finish(); + } + } + )* + }; +} + +float![f32, f64]; + +impl ValueEncoder for bool { + #[inline] + fn encode(self, v: ValueSer<'_>) { + v.write_raw_json(if self { b"true" } else { b"false" }); + } +} + +impl ValueEncoder for Option { + #[inline] + fn encode(self, v: ValueSer<'_>) { + match self { + Some(value) => value.encode(v), + None => Null.encode(v), + } + } +} + +impl KeyEncoder for &str { + #[inline] + fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> { + let obj = &mut *obj; + obj.entry_inner(|b| format_escaped_str(b, self)) + } +} + +impl KeyEncoder for fmt::Arguments<'_> { + #[inline] + fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> { + if let Some(key) = self.as_str() { + obj.entry_inner(|b| format_escaped_str(b, key)) + } else { + obj.entry_inner(|b| format_escaped_fmt(b, self)) + } + } +} + +/// Represents the JSON null value. +pub struct Null; + +impl ValueEncoder for Null { + #[inline] + fn encode(self, v: ValueSer<'_>) { + v.write_raw_json(b"null"); + } +} + +impl ValueEncoder for Vec { + #[inline] + fn encode(self, v: ValueSer<'_>) { + value_as_list!(|v| { + for t in self { + v.entry().value(t); + } + }); + } +} + +impl ValueEncoder for &[T] { + #[inline] + fn encode(self, v: ValueSer<'_>) { + value_as_list!(|v| { + for t in self { + v.entry().value(t); + } + }); + } +} + +impl ValueEncoder for HashMap { + #[inline] + fn encode(self, o: ValueSer<'_>) { + value_as_object!(|o| { + for (k, v) in self { + o.entry(k, v); + } + }); + } +} + +impl ValueEncoder for BTreeMap { + #[inline] + fn encode(self, o: ValueSer<'_>) { + value_as_object!(|o| { + for (k, v) in self { + o.entry(k, v); + } + }); + } +} diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index b8304f9d8d..274c81c500 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -52,7 +52,7 @@ pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { } // yield every ~250us // hopefully reduces tail latencies - if i % 1024 == 0 { + if i.is_multiple_of(1024) { yield_now().await } } diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 41b22e35b6..828884ffd8 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -90,7 +90,7 @@ pub struct InnerClient { } impl InnerClient { - pub fn start(&mut self) -> Result { + pub fn start(&mut self) -> Result, Error> { self.responses.waiting += 1; Ok(PartialQuery(Some(self))) } @@ -227,7 +227,7 @@ impl Client { &mut self, statement: &str, params: I, - ) -> Result + ) -> Result, Error> where S: AsRef, I: IntoIterator>, @@ -262,7 +262,7 @@ impl Client { pub(crate) async fn simple_query_raw( &mut self, query: &str, - ) -> Result { + ) -> Result, Error> { simple_query::simple_query(self.inner_mut(), query).await } diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index eeefb45d26..4c5fc623c5 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -12,7 +12,11 @@ mod private { /// This trait is "sealed", and cannot be implemented outside of this crate. pub trait GenericClient: private::Sealed { /// Like `Client::query_raw_txt`. 
- async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result + async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result, Error> where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, @@ -22,7 +26,11 @@ pub trait GenericClient: private::Sealed { impl private::Sealed for Client {} impl GenericClient for Client { - async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result + async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result, Error> where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, @@ -35,7 +43,11 @@ impl GenericClient for Client { impl private::Sealed for Transaction<'_> {} impl GenericClient for Transaction<'_> { - async fn query_raw_txt(&mut self, statement: &str, params: I) -> Result + async fn query_raw_txt( + &mut self, + statement: &str, + params: I, + ) -> Result, Error> where S: AsRef + Sync + Send, I: IntoIterator> + Sync + Send, diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index 12fe0737d4..0e37d2aad7 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -47,7 +47,7 @@ impl<'a> Transaction<'a> { &mut self, statement: &str, params: I, - ) -> Result + ) -> Result, Error> where S: AsRef, I: IntoIterator>, diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index bd18d80915..69316fd493 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,6 +13,7 @@ aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true +base64.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true @@ -41,6 +42,8 @@ http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } +byteorder = "1.4" + [dev-dependencies] camino-tempfile.workspace = true test-context.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index e9c24ac723..db30829216 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; -use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::blob::BlobBlockType; +use azure_storage_blobs::blob::BlockList; use azure_storage_blobs::blob::{Blob, CopyStatus}; use azure_storage_blobs::container::operations::ListBlobsBuilder; -use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; +use azure_storage_blobs::prelude::ClientBuilder; +use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use base64::{Engine as _, engine::general_purpose::URL_SAFE}; +use byteorder::{BigEndian, ByteOrder}; use bytes::Bytes; +use camino::Utf8Path; use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; +use tokio::fs::File; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; @@ -51,6 +59,9 @@ pub struct AzureBlobStorage { // 
Alternative timeout used for metadata objects which are expected to be small pub small_timeout: Duration, + /* BEGIN_HADRON */ + pub put_block_size_mb: Option, + /* END_HADRON */ } impl AzureBlobStorage { @@ -107,6 +118,9 @@ impl AzureBlobStorage { concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, small_timeout, + /* BEGIN_HADRON */ + put_block_size_mb: azure_config.put_block_size_mb, + /* END_HADRON */ }) } @@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage { let started_at = start_measuring_requests(kind); - let op = async { + let mut metadata_map = metadata.unwrap_or([].into()); + let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block"); + + /* BEGIN_HADRON */ + let op = async move { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024; + if timeline_file_path.is_none() || put_block_size == 0 { + // Use put_block_blob directly. + let from: Pin< + Box> + Send + Sync + 'static>, + > = Box::pin(from); + let from = NonSeekableStream::new(from, data_size_bytes); + let body = azure_core::Body::SeekableStream(Box::new(from)); - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let mut builder = blob_client.put_block_blob(body); + if !metadata_map.0.is_empty() { + builder = builder.metadata(to_azure_metadata(metadata_map)); + } + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + let result = fut.await; + match result { + Ok(Ok(_response)) => return Ok(()), + Ok(Err(azure)) => return Err(azure.into()), + Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()), + }; + } + // Upload chunks concurrently using Put Block. + // Each PutBlock uploads put_block_size bytes of the file. + let mut upload_futures: Vec>> = + vec![]; + let mut block_list = BlockList::default(); + let mut start_bytes = 0u64; + let mut remaining_bytes = data_size_bytes; + let mut block_list_count = 0; - let from = NonSeekableStream::new(from, data_size_bytes); + while remaining_bytes > 0 { + let block_size = std::cmp::min(remaining_bytes, put_block_size); + let end_bytes = start_bytes + block_size as u64; + let block_id = block_list_count; + let timeout = self.timeout; + let blob_client = blob_client.clone(); + let timeline_file = timeline_file_path.clone().unwrap().clone(); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let mut encoded_block_id = [0u8; 8]; + BigEndian::write_u64(&mut encoded_block_id, block_id); + URL_SAFE.encode(encoded_block_id); - let mut builder = blob_client.put_block_blob(body); + // Put one block. + let part_fut = async move { + let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?; + file.seek(io::SeekFrom::Start(start_bytes)).await?; + let limited_reader = file.take(block_size as u64); + let file_chunk_stream = + tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024); + let file_chunk_stream_pin: Pin< + Box> + Send + Sync + 'static>, + > = Box::pin(file_chunk_stream); + let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size); + let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper)); + // Azure put block takes URL-encoded block ids and all blocks must have the same byte length. 
+ // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters + let builder = blob_client.put_block(encoded_block_id.to_vec(), body); + let fut = builder.into_future(); + let fut = tokio::time::timeout(timeout, fut); + let result = fut.await; + tracing::debug!( + "azure put block id-{} size {} start {} end {} file {} response {:#?}", + block_id, + block_size, + start_bytes, + end_bytes, + timeline_file, + result + ); + match result { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure), + Err(_timeout) => Err(azure_core::Error::new( + azure_core::error::ErrorKind::Io, + std::io::Error::new( + std::io::ErrorKind::TimedOut, + "Operation timed out", + ), + )), + } + }; + upload_futures.push(tokio::spawn(part_fut)); - if let Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); + block_list_count += 1; + remaining_bytes -= block_size; + start_bytes += block_size as u64; + + block_list + .blocks + .push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into())); } + tracing::debug!( + "azure put blocks {} total MB: {} chunk size MB: {}", + block_list_count, + data_size_bytes / 1024 / 1024, + put_block_size / 1024 / 1024 + ); + // Wait for all blocks to be uploaded. + let upload_results = futures::future::try_join_all(upload_futures).await; + if upload_results.is_err() { + return Err(anyhow::anyhow!(format!( + "Failed to upload all blocks {:#?}", + upload_results.unwrap_err() + ))); + } + + // Commit the blocks. + let mut builder = blob_client.put_block_list(block_list); + if !metadata_map.0.is_empty() { + builder = builder.metadata(to_azure_metadata(metadata_map)); + } let fut = builder.into_future(); let fut = tokio::time::timeout(self.timeout, fut); + let result = fut.await; + tracing::debug!("azure put block list response {:#?}", result); - match fut.await { + match result { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; + /* END_HADRON */ let res = tokio::select! { res = op => res, @@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage { crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, outcome, started_at); - res } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 5bc1f678ae..e13e17d544 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -195,8 +195,19 @@ pub struct AzureConfig { pub max_keys_per_list_response: Option, #[serde(default = "default_azure_conn_pool_size")] pub conn_pool_size: usize, + /* BEGIN_HADRON */ + #[serde(default = "default_azure_put_block_size_mb")] + pub put_block_size_mb: Option, + /* END_HADRON */ } +/* BEGIN_HADRON */ +fn default_azure_put_block_size_mb() -> Option { + // Disable parallel upload by default. 
+ Some(0) +} +/* END_HADRON */ + fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() } @@ -213,6 +224,9 @@ impl Debug for AzureConfig { "max_keys_per_list_response", &self.max_keys_per_list_response, ) + /* BEGIN_HADRON */ + .field("put_block_size_mb", &self.put_block_size_mb) + /* END_HADRON */ .finish() } } @@ -352,6 +366,7 @@ timeout = '5s'"; upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' conn_pool_size = 8 + put_block_size_mb = 1024 "; let config = parse(toml).unwrap(); @@ -367,6 +382,9 @@ timeout = '5s'"; concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, conn_pool_size: 8, + /* BEGIN_HADRON */ + put_block_size_mb: Some(1024), + /* END_HADRON */ }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index daab05d91a..fb7d6fd482 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + + /* BEGIN_HADRON */ + let mut metadata = None; + if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) { + let file_path = "/tmp/dbx_upload_tmp_file.txt"; + { + // Open the file in append mode + let mut file = std::fs::OpenOptions::new() + .append(true) + .create(true) // Create the file if it doesn't exist + .open(file_path)?; + // Append some bytes to the file + std::io::Write::write_all( + &mut file, + &format!("remote blob data {i}").into_bytes(), + )?; + file.sync_all()?; + } + metadata = Some(remote_storage::StorageMetadata::from([( + "databricks_azure_put_block", + file_path, + )])); + } + /* END_HADRON */ + task_client - .upload(data, data_len, &blob_path, None, &cancel) + .upload(data, data_len, &blob_path, metadata, &cancel) .await?; + // TODO: Check upload is using the put_block upload. + // We cannot consume data here since data is moved inside the upload. 
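// Editor's sketch of the assumed call shape for opting into the chunked Put Block path:
// pass the local source file path under the "databricks_azure_put_block" metadata key, as
// the test above does. Without that key, or with put_block_size_mb = 0 (the default), the
// upload falls back to a single put_block_blob. `storage`, `byte_stream`, `data_len`,
// `remote_path`, and `cancel` are placeholders from the caller's scope.
let metadata = remote_storage::StorageMetadata::from([(
    "databricks_azure_put_block",
    "/path/to/local/source/file",
)]);
storage
    .upload(byte_stream, data_len, &remote_path, Some(metadata), &cancel)
    .await?;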
+ // let total_bytes = data.fold(0, |acc, chunk| async move { + // acc + chunk.map(|bytes| bytes.len()).unwrap_or(0) + // }).await; + // assert_eq!(total_bytes, data_len); + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 31c9ca3200..4d7caabd39 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -219,6 +219,9 @@ async fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, conn_pool_size: 8, + /* BEGIN_HADRON */ + put_block_size_mb: Some(1), + /* END_HADRON */ }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 7042e42e66..013aa55a86 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -184,7 +184,7 @@ pub struct TimelineMembershipSwitchRequest { pub struct TimelineMembershipSwitchResponse { pub previous_conf: Configuration, pub current_conf: Configuration, - pub term: Term, + pub last_log_term: Term, pub flush_lsn: Lsn, } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 0ac8201795..5828a400a0 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -24,12 +24,28 @@ macro_rules! critical { if cfg!(debug_assertions) { panic!($($arg)*); } + // Increment both metrics $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); }}; } +#[macro_export] +macro_rules! critical_timeline { + ($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{ + if cfg!(debug_assertions) { + panic!($($arg)*); + } + // Increment both metrics + $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); + $crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string()); + let backtrace = std::backtrace::Backtrace::capture(); + tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}", + $tenant_shard_id, $timeline_id, format!($($arg)*)); + }}; +} + #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { @@ -61,6 +77,36 @@ pub struct TracingEventCountMetric { trace: IntCounter, } +// Begin Hadron: Add a HadronCriticalStorageEventCountMetric metric that is sliced by tenant_id and timeline_id +pub struct HadronCriticalStorageEventCountMetric { + critical: IntCounterVec, +} + +pub static HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC: Lazy = + Lazy::new(|| { + let vec = metrics::register_int_counter_vec!( + "hadron_critical_storage_event_count", + "Number of critical storage events, by tenant_id and timeline_id", + &["tenant_shard_id", "timeline_id"] + ) + .expect("failed to define metric"); + HadronCriticalStorageEventCountMetric::new(vec) + }); + +impl HadronCriticalStorageEventCountMetric { + fn new(vec: IntCounterVec) -> Self { + Self { critical: vec } + } + + // Allow public access from `critical!` macro. 
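// Editor's sketch of invoking the new critical_timeline! macro (arguments are placeholders):
// in debug builds it panics; otherwise it bumps the global critical counter and the
// per-tenant/timeline hadron_critical_storage_event_count counter, then logs the message
// with a backtrace.
utils::critical_timeline!(
    tenant_shard_id,
    timeline_id,
    "local data loss suspected: {}",
    details
);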
+ pub fn inc(&self, tenant_shard_id: &str, timeline_id: &str) { + self.critical + .with_label_values(&[tenant_shard_id, timeline_id]) + .inc(); + } +} +// End Hadron + pub static TRACING_EVENT_COUNT_METRIC: Lazy = Lazy::new(|| { let vec = metrics::register_int_counter_vec!( "libmetrics_tracing_event_count", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8a2e2ed3be..1fd0dccff0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -112,6 +112,7 @@ twox-hash.workspace = true procfs.workspace = true [dev-dependencies] +base64.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml new file mode 100644 index 0000000000..5a3a2761c2 --- /dev/null +++ b/pageserver/client_grpc/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "pageserver_client_grpc" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +futures.workspace = true +pageserver_page_api.workspace = true +tokio.workspace = true +tokio-stream.workspace = true +tonic.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs new file mode 100644 index 0000000000..c900e1a939 --- /dev/null +++ b/pageserver/client_grpc/src/lib.rs @@ -0,0 +1,14 @@ +//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client` +//! gRPC client, and supports: +//! +//! * Sharded tenants across multiple Pageservers. +//! * Pooling of connections, clients, and streams for efficient resource use. +//! * Concurrent use by many callers. +//! * Internal handling of GetPage bidirectional streams. +//! * Automatic retries. +//! * Observability. +//! +//! The client is under development, this package is just a shell. + +#[allow(unused)] +mod pool; diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs new file mode 100644 index 0000000000..518e4e5b84 --- /dev/null +++ b/pageserver/client_grpc/src/pool.rs @@ -0,0 +1,586 @@ +//! This module provides various Pageserver gRPC client resource pools. +//! +//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across +//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency +//! of creating dedicated TCP connections and server tasks for every Postgres backend. +//! +//! Each resource has its own, nested pool. The pools are custom-built for the properties of each +//! resource -- they are different enough that a generic pool isn't suitable. +//! +//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients +//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a +//! per-channel client limit. Channels may be closed when they are no longer used by any clients. +//! +//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared) +//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a +//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed +//! from the pool after some time, to free up the channel. +//! +//! * StreamPool: manages bidirectional gRPC GetPage streams. 
Each stream acquires a client from the +//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it +//! returns a guard that can be used to send a single request, to properly enforce queue depth and +//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request, +//! possibly pipelining multiple requests from multiple callers on the same stream (up to some +//! queue depth). Idle streams may be removed from the pool after a while to free up the client. +//! +//! Each channel corresponds to one TCP connection. Each client unary request and each stream +//! corresponds to one HTTP/2 stream and server task. +//! +//! TODO: error handling (including custom error types). +//! TODO: observability. + +use std::collections::{BTreeMap, HashMap}; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, Weak}; + +use futures::StreamExt as _; +use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use tonic::transport::{Channel, Endpoint}; +use tracing::{error, warn}; + +use pageserver_page_api as page_api; +use utils::id::{TenantId, TimelineId}; +use utils::shard::ShardIndex; + +/// Max number of concurrent clients per channel. +/// +/// TODO: tune these constants, and make them configurable. +/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels +/// with only streams. +const CLIENTS_PER_CHANNEL: usize = 16; + +/// Maximum number of concurrent clients per `ClientPool`. +const CLIENT_LIMIT: usize = 64; + +/// Max number of pipelined requests per gRPC GetPage stream. +const STREAM_QUEUE_DEPTH: usize = 2; + +/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 +/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of +/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients. +/// +/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. +/// +/// TODO: reap idle channels. +/// TODO: consider prewarming a set of channels, to avoid initial connection latency. +/// TODO: consider adding a circuit breaker for errors and fail fast. +pub struct ChannelPool { + /// Pageserver endpoint to connect to. + endpoint: Endpoint, + /// Open channels. + channels: Mutex>, + /// Channel ID generator. + next_channel_id: AtomicUsize, +} + +type ChannelID = usize; + +struct ChannelEntry { + /// The gRPC channel (i.e. TCP connection). Shared by multiple clients. + channel: Channel, + /// Number of clients using this channel. + clients: usize, +} + +impl ChannelPool { + /// Creates a new channel pool for the given Pageserver endpoint. + pub fn new(endpoint: E) -> anyhow::Result> + where + E: TryInto + Send + Sync + 'static, + >::Error: std::error::Error + Send + Sync, + { + Ok(Arc::new(Self { + endpoint: endpoint.try_into()?, + channels: Mutex::default(), + next_channel_id: AtomicUsize::default(), + })) + } + + /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. + /// + /// This never blocks (except for mutex acquisition). The channel is connected lazily on first + /// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established + /// automatically on failure (TODO: verify). 
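// Editor's wiring sketch for the nested pools described above, assuming an enclosing
// async fn returning anyhow::Result<()>. Constructor and guard signatures match this patch;
// the endpoint, tenant/timeline IDs, shard index, and request values are placeholders.
let channel_pool = ChannelPool::new("http://pageserver:51051".to_string())?;
let client_pool = ClientPool::new(
    channel_pool.clone(),
    tenant_id,
    timeline_id,
    shard_id,
    None, // auth token
);
// Give GetPage streams their own ClientPool so long-lived streams don't starve unary calls.
let stream_pool = StreamPool::new(ClientPool::new(
    channel_pool,
    tenant_id,
    timeline_id,
    shard_id,
    None,
));

// Unary RPC: borrow a pooled client; it returns to the pool when the guard drops.
let mut client = client_pool.get().await?;
let _exists = client.check_rel_exists(check_rel_exists_req).await?;

// GetPage: one guard per request; queue depth and response routing are handled internally.
let _page = stream_pool.get().await.send(get_page_req).await?;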
+ /// + /// Callers should not clone the returned channel, and must hold onto the returned guard as long + /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf + /// client requires an owned `Channel` and we don't have access to the channel's internal + /// refcount. + /// + /// This is not performance-sensitive. It is only called when creating a new client, and clients + /// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n) + /// performance is therefore okay. + pub fn get(self: &Arc) -> ChannelGuard { + let mut channels = self.channels.lock().unwrap(); + + // Try to find an existing channel with available capacity. We check entries in BTreeMap + // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients + // with lower-ordered channel IDs first. This will cluster clients in lower-ordered + // channels, and free up higher-ordered channels such that they can be reaped. + for (&id, entry) in channels.iter_mut() { + assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow"); + if entry.clients < CLIENTS_PER_CHANNEL { + entry.clients += 1; + return ChannelGuard { + pool: Arc::downgrade(self), + id, + channel: Some(entry.channel.clone()), + }; + } + } + + // Create a new channel. We connect lazily on first use, such that we don't block here and + // other clients can join onto the same channel while it's connecting. + let channel = self.endpoint.connect_lazy(); + + let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed); + let entry = ChannelEntry { + channel: channel.clone(), + clients: 1, // account for the guard below + }; + channels.insert(id, entry); + + ChannelGuard { + pool: Arc::downgrade(self), + id, + channel: Some(channel), + } + } +} + +/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, +/// since the gRPC client requires an owned `Channel`. +pub struct ChannelGuard { + pool: Weak, + id: ChannelID, + channel: Option, +} + +impl ChannelGuard { + /// Returns the inner owned channel. Panics if called more than once. The caller must hold onto + /// the guard as long as the channel is in use, and should not clone it. + pub fn take(&mut self) -> Channel { + self.channel.take().expect("channel already taken") + } +} + +/// Returns the channel to the pool. +impl Drop for ChannelGuard { + fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + let mut channels = pool.channels.lock().unwrap(); + let entry = channels.get_mut(&self.id).expect("unknown channel"); + assert!(entry.clients > 0, "channel underflow"); + entry.clients -= 1; + } +} + +/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner +/// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total +/// number of concurrent clients to `CLIENT_LIMIT` via semaphore. +/// +/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. +/// +/// TODO: reap idle clients. +pub struct ClientPool { + /// Tenant ID. + tenant_id: TenantId, + /// Timeline ID. + timeline_id: TimelineId, + /// Shard ID. + shard_id: ShardIndex, + /// Authentication token, if any. + auth_token: Option, + /// Channel pool to acquire channels from. + channel_pool: Arc, + /// Limits the max number of concurrent clients for this pool. + limiter: Arc, + /// Idle pooled clients. 
Acquired clients are removed from here and returned on drop. + /// + /// The first client in the map will be acquired next. The map is sorted by client ID, which in + /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from + /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle + /// clients are reaped. + idle: Mutex>, + /// Unique client ID generator. + next_client_id: AtomicUsize, +} + +type ClientID = (ChannelID, usize); + +struct ClientEntry { + /// The pooled gRPC client. + client: page_api::Client, + /// The channel guard for the channel used by the client. + channel_guard: ChannelGuard, +} + +impl ClientPool { + /// Creates a new client pool for the given tenant shard. Channels are acquired from the given + /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. + pub fn new( + channel_pool: Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + ) -> Arc { + Arc::new(Self { + tenant_id, + timeline_id, + shard_id, + auth_token, + channel_pool, + idle: Mutex::default(), + limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)), + next_client_id: AtomicUsize::default(), + }) + } + + /// Gets a client from the pool, or creates a new one if necessary. Connections are established + /// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The + /// client is returned to the pool when the guard is dropped. + /// + /// This is moderately performance-sensitive. It is called for every unary request, but these + /// establish a new gRPC stream per request so they're already expensive. GetPage requests use + /// the `StreamPool` instead. + pub async fn get(self: &Arc) -> anyhow::Result { + let permit = self + .limiter + .clone() + .acquire_owned() + .await + .expect("never closed"); + + // Fast path: acquire an idle client from the pool. + if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() { + return Ok(ClientGuard { + pool: Arc::downgrade(self), + id, + client: Some(entry.client), + channel_guard: Some(entry.channel_guard), + permit, + }); + } + + // Slow path: construct a new client. + let mut channel_guard = self.channel_pool.get(); + let client = page_api::Client::new( + channel_guard.take(), + self.tenant_id, + self.timeline_id, + self.shard_id, + self.auth_token.clone(), + None, + )?; + + Ok(ClientGuard { + pool: Arc::downgrade(self), + id: ( + channel_guard.id, + self.next_client_id.fetch_add(1, Ordering::Relaxed), + ), + client: Some(client), + channel_guard: Some(channel_guard), + permit, + }) + } +} + +/// A client acquired from the pool. The inner client can be accessed via Deref. The client is +/// returned to the pool when dropped. +pub struct ClientGuard { + pool: Weak, + id: ClientID, + client: Option, // Some until dropped + channel_guard: Option, // Some until dropped + permit: OwnedSemaphorePermit, +} + +impl Deref for ClientGuard { + type Target = page_api::Client; + + fn deref(&self) -> &Self::Target { + self.client.as_ref().expect("not dropped") + } +} + +impl DerefMut for ClientGuard { + fn deref_mut(&mut self) -> &mut Self::Target { + self.client.as_mut().expect("not dropped") + } +} + +/// Returns the client to the pool. 
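// Editor's illustration of the (channel_id, client_id) ordering noted above: pop_first()
// yields the idle client on the lowest-numbered channel first, so clients cluster on low
// channels and higher-numbered channels can drain and be reaped.
let mut idle = std::collections::BTreeMap::from([
    ((3usize, 2usize), "idle client on channel 3"),
    ((0, 7), "idle client on channel 0"),
]);
assert_eq!(idle.pop_first(), Some(((0, 7), "idle client on channel 0")));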
+impl Drop for ClientGuard { + fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + let entry = ClientEntry { + client: self.client.take().expect("dropped once"), + channel_guard: self.channel_guard.take().expect("dropped once"), + }; + pool.idle.lock().unwrap().insert(self.id, entry); + + _ = self.permit; // returned on drop, referenced for visibility + } +} + +/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream +/// acquires a client from the inner `ClientPool` for the stream's lifetime. +/// +/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send +/// a single request and await the response. Internally, requests are multiplexed across streams and +/// channels. This allows proper queue depth enforcement and response routing. +/// +/// TODO: reap idle streams. +/// TODO: consider making this generic over request and response types; not currently needed. +pub struct StreamPool { + /// The client pool to acquire clients from. + client_pool: Arc, + /// All pooled streams. + /// + /// Incoming requests will be sent over an existing stream with available capacity. If all + /// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit). + /// Each stream has an associated Tokio task that processes requests and responses. + streams: Arc>>, + /// Limits the max number of concurrent requests (not streams). + limiter: Arc, + /// Stream ID generator. + next_stream_id: AtomicUsize, +} + +type StreamID = usize; +type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>; +type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>; +type ResponseSender = oneshot::Sender>; + +struct StreamEntry { + /// Sends caller requests to the stream task. The stream task exits when this is dropped. + sender: RequestSender, + /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on + /// completion without acquiring the `StreamPool::streams` lock. + queue_depth: Arc, +} + +impl StreamPool { + /// Creates a new stream pool, using the given client pool. + /// + /// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may + /// fill up the client pool and starve out unary requests. Client pools can share the same + /// `ChannelPool` though, since the channel pool is unbounded. + pub fn new(client_pool: Arc) -> Arc { + Arc::new(Self { + client_pool, + streams: Arc::default(), + limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)), + next_stream_id: AtomicUsize::default(), + }) + } + + /// Acquires an available stream from the pool, or spins up a new stream async if all streams + /// are full. Returns a guard that can be used to send a single request on the stream and await + /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity + /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight). + /// + /// This is very performance-sensitive, as it is on the GetPage hot path. + /// + /// TODO: this must do something more sophisticated for performance. We want: + /// + /// * Cheap, concurrent access in the common case where we can use a pooled stream. + /// * Quick acquisition of pooled streams with available capacity. + /// * Prefer streams that belong to lower-numbered channels, to reap idle channels. + /// * Prefer filling up existing streams' queue depth before spinning up new streams. 
+ /// * Don't hold a lock while spinning up new streams. + /// * Allow concurrent clients to join onto streams while they're spun up. + /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. + /// + /// For now, we just do something simple and functional, but very inefficient (linear scan). + pub async fn get(&self) -> StreamGuard { + let permit = self + .limiter + .clone() + .acquire_owned() + .await + .expect("never closed"); + let mut streams = self.streams.lock().unwrap(); + + // Look for a pooled stream with available capacity. + for entry in streams.values() { + assert!( + entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH, + "stream queue overflow" + ); + if entry + .queue_depth + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { + // Increment the queue depth via compare-and-swap. + // TODO: review ordering. + (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1) + }) + .is_ok() + { + return StreamGuard { + sender: entry.sender.clone(), + queue_depth: entry.queue_depth.clone(), + permit, + }; + } + } + + // No available stream, spin up a new one. We install the stream entry in the pool first and + // return the guard, while spinning up the stream task async. This allows other callers to + // join onto this stream and also create additional streams concurrently if this fills up. + let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); + let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller + let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH); + let entry = StreamEntry { + sender: req_tx.clone(), + queue_depth: queue_depth.clone(), + }; + streams.insert(id, entry); + + // NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT * + // STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more + // streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen + // because we only acquire queue depth under lock and after acquiring a semaphore permit. + assert!(streams.len() <= CLIENT_LIMIT, "stream overflow"); + + let client_pool = self.client_pool.clone(); + let streams = self.streams.clone(); + + tokio::spawn(async move { + if let Err(err) = Self::run_stream(client_pool, req_rx).await { + error!("stream failed: {err}"); + } + // Remove stream from pool on exit. + let entry = streams.lock().unwrap().remove(&id); + assert!(entry.is_some(), "unknown stream ID: {id}"); + }); + + StreamGuard { + sender: req_tx, + queue_depth, + permit, + } + } + + /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a + /// bidirectional GetPage stream, then forwards requests and responses between callers and the + /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be + /// atomic with pool stream acquisition. + /// + /// The task exits when the request channel is closed, or on a stream error. The caller is + /// responsible for removing the stream from the pool on exit. + async fn run_stream( + client_pool: Arc, + mut caller_rx: RequestReceiver, + ) -> anyhow::Result<()> { + // Acquire a client from the pool and create a stream. + let mut client = client_pool.get().await?; + + let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH); + let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); + let mut resp_stream = client.get_pages(req_stream).await?; + + // Track caller response channels by request ID. 
If the task returns early, these response + // channels will be dropped and the waiting callers will receive an error. + let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH); + + // Process requests and responses. + loop { + // NB: this can trip if the server doesn't respond to a request, so only debug_assert. + debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow"); + + tokio::select! { + // Receive requests from callers and send them to the stream. + req = caller_rx.recv() => { + // Shut down if request channel is closed. + let Some((req, resp_tx)) = req else { + return Ok(()); + }; + + // Store the response channel by request ID. + if callers.contains_key(&req.request_id) { + // Error on request ID duplicates. Ignore callers that went away. + _ = resp_tx.send(Err(tonic::Status::invalid_argument( + format!("duplicate request ID: {}", req.request_id), + ))); + continue; + } + callers.insert(req.request_id, resp_tx); + + // Send the request on the stream. Bail out if the send fails. + req_tx.send(req).await.map_err(|_| { + tonic::Status::unavailable("stream closed") + })?; + } + + // Receive responses from the stream and send them to callers. + resp = resp_stream.next() => { + // Shut down if the stream is closed, and bail out on stream errors. + let Some(resp) = resp.transpose()? else { + return Ok(()) + }; + + // Send the response to the caller. Ignore errors if the caller went away. + let Some(resp_tx) = callers.remove(&resp.request_id) else { + warn!("received response for unknown request ID: {}", resp.request_id); + continue; + }; + _ = resp_tx.send(Ok(resp)); + } + } + } + } +} + +/// A pooled stream reference. Can be used to send a single request, to properly enforce queue +/// depth. Queue depth is already reserved and will be returned on drop. +pub struct StreamGuard { + sender: RequestSender, + queue_depth: Arc, + permit: OwnedSemaphorePermit, +} + +impl StreamGuard { + /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only + /// valid for a single request (to enforce queue depth). This also drops the guard on return and + /// returns the queue depth quota to the pool. + /// + /// The `GetPageRequest::request_id` must be unique across in-flight requests. + /// + /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status` + /// to avoid tearing down the stream for per-request errors. Callers must check this. + pub async fn send( + self, + req: page_api::GetPageRequest, + ) -> tonic::Result { + let (resp_tx, resp_rx) = oneshot::channel(); + + self.sender + .send((req, resp_tx)) + .await + .map_err(|_| tonic::Status::unavailable("stream closed"))?; + + resp_rx + .await + .map_err(|_| tonic::Status::unavailable("stream closed"))? + } +} + +impl Drop for StreamGuard { + fn drop(&mut self) { + // Release the queue depth reservation on drop. This can prematurely decrement it if dropped + // before the response is received, but that's okay. 
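// Editor's note on the capacity implied by the constants above: at most CLIENT_LIMIT
// streams per pool, each pipelining up to STREAM_QUEUE_DEPTH requests, so the request
// semaphore admits CLIENT_LIMIT * STREAM_QUEUE_DEPTH = 64 * 2 = 128 in-flight GetPage
// requests. The constant name below is illustrative.
const MAX_IN_FLIGHT_GETPAGES: usize = CLIENT_LIMIT * STREAM_QUEUE_DEPTH; // 128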
+ let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst); + assert!(prev_queue_depth > 0, "stream queue underflow"); + + _ = self.permit; // returned on drop, referenced for visibility + } +} diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 6cce2844c7..838d00e490 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,10 +1,101 @@ +use std::str::FromStr; + use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::IndexPart; +use pageserver::tenant::{ + IndexPart, + layer_map::{LayerMap, SearchResult}, + remote_timeline_client::remote_layer_path, + storage_layer::{PersistentLayerDesc, ReadableLayerWeak}, +}; +use pageserver_api::key::Key; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, + shard::TenantShardId, +}; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { - Dump { path: Utf8PathBuf }, + Dump { + path: Utf8PathBuf, + }, + /// Find all layers that need to be searched to construct the given page at the given LSN. + Search { + #[arg(long)] + tenant_id: String, + #[arg(long)] + timeline_id: String, + #[arg(long)] + path: Utf8PathBuf, + #[arg(long)] + key: String, + #[arg(long)] + lsn: String, + }, +} + +async fn search_layers( + tenant_id: &str, + timeline_id: &str, + path: &Utf8PathBuf, + key: &str, + lsn: &str, +) -> anyhow::Result<()> { + let tenant_id = TenantId::from_str(tenant_id).unwrap(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::from_str(timeline_id).unwrap(); + let index_json = { + let bytes = tokio::fs::read(path).await?; + IndexPart::from_json_bytes(&bytes).unwrap() + }; + let mut layer_map = LayerMap::default(); + { + let mut updates = layer_map.batch_update(); + for (key, value) in index_json.layer_metadata.iter() { + updates.insert_historic(PersistentLayerDesc::from_filename( + tenant_shard_id, + timeline_id, + key.clone(), + value.file_size, + )); + } + } + let key = Key::from_hex(key)?; + + let lsn = Lsn::from_str(lsn).unwrap(); + let mut end_lsn = lsn; + loop { + let result = layer_map.search(key, end_lsn); + match result { + Some(SearchResult { layer, lsn_floor }) => { + let disk_layer = match layer { + ReadableLayerWeak::PersistentLayer(layer) => layer, + ReadableLayerWeak::InMemoryLayer(_) => { + anyhow::bail!("unexpected in-memory layer") + } + }; + + let metadata = index_json + .layer_metadata + .get(&disk_layer.layer_name()) + .unwrap(); + println!( + "{}", + remote_layer_path( + &tenant_id, + &timeline_id, + metadata.shard, + &disk_layer.layer_name(), + metadata.generation + ) + ); + end_lsn = lsn_floor; + } + None => break, + } + } + Ok(()) } pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { @@ -16,5 +107,12 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { println!("{output}"); Ok(()) } + IndexPartCmd::Search { + tenant_id, + timeline_id, + path, + key, + lsn, + } => search_layers(tenant_id, timeline_id, path, key, lsn).await, } } diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 65e41540b8..6523d00d3d 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -1,23 +1,151 @@ -use anyhow::Result; +use anyhow::Context as _; use futures::{Stream, StreamExt as _, TryStreamExt as _}; use tokio::io::AsyncRead; use tokio_util::io::StreamReader; +use tonic::codec::CompressionEncoding; use tonic::metadata::AsciiMetadataValue; -use tonic::metadata::errors::InvalidMetadataValue; -use 
tonic::transport::Channel; -use tonic::{Request, Streaming}; +use tonic::service::Interceptor; +use tonic::service::interceptor::InterceptedService; +use tonic::transport::{Channel, Endpoint}; -use utils::id::TenantId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; -use crate::model; +use crate::model::*; use crate::proto; -/// -/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These -/// headers are required at the pageserver. -/// +/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain +/// types from `model` rather than generated Protobuf types. +pub struct Client { + inner: proto::PageServiceClient>, +} + +impl Client { + /// Connects to the given gRPC endpoint. + pub async fn connect( + endpoint: E, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + compression: Option, + ) -> anyhow::Result + where + E: TryInto + Send + Sync + 'static, + >::Error: std::error::Error + Send + Sync, + { + let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?; + let channel = endpoint.connect().await?; + Self::new( + channel, + tenant_id, + timeline_id, + shard_id, + auth_token, + compression, + ) + } + + /// Creates a new client using the given gRPC channel. + pub fn new( + channel: Channel, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + compression: Option, + ) -> anyhow::Result { + let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?; + let mut inner = proto::PageServiceClient::with_interceptor(channel, auth); + + if let Some(compression) = compression { + // TODO: benchmark this (including network latency). + inner = inner + .accept_compressed(compression) + .send_compressed(compression); + } + + Ok(Self { inner }) + } + + /// Returns whether a relation exists. + pub async fn check_rel_exists( + &mut self, + req: CheckRelExistsRequest, + ) -> tonic::Result { + let req = proto::CheckRelExistsRequest::from(req); + let resp = self.inner.check_rel_exists(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches a base backup. + pub async fn get_base_backup( + &mut self, + req: GetBaseBackupRequest, + ) -> tonic::Result> { + let req = proto::GetBaseBackupRequest::from(req); + let chunks = self.inner.get_base_backup(req).await?.into_inner(); + Ok(StreamReader::new( + chunks + .map_ok(|resp| resp.chunk) + .map_err(std::io::Error::other), + )) + } + + /// Returns the total size of a database, as # of bytes. + pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result { + let req = proto::GetDbSizeRequest::from(req); + let resp = self.inner.get_db_size(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches pages. + /// + /// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are + /// typically returned as status_code instead of errors, to avoid tearing down the entire stream + /// via a tonic::Status error. + pub async fn get_pages( + &mut self, + reqs: impl Stream + Send + 'static, + ) -> tonic::Result> + Send + 'static> { + let reqs = reqs.map(proto::GetPageRequest::from); + let resps = self.inner.get_pages(reqs).await?.into_inner(); + Ok(resps.map_ok(GetPageResponse::from)) + } + + /// Returns the size of a relation, as # of blocks. 
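// Editor's usage sketch for the bidirectional get_pages() API above, assuming an enclosing
// async fn returning anyhow::Result<()> and an mpsc-backed request stream; `client` and
// `get_page_req` are placeholders. Per-request failures surface via the response's
// status_code, while a tonic::Status error ends the whole stream.
use futures::StreamExt as _;
use tokio_stream::wrappers::ReceiverStream;

let (req_tx, req_rx) = tokio::sync::mpsc::channel(16);
let mut responses = client.get_pages(ReceiverStream::new(req_rx)).await?;
req_tx
    .send(get_page_req)
    .await
    .map_err(|_| anyhow::anyhow!("request stream closed"))?;
if let Some(resp) = responses.next().await {
    let resp = resp?; // stream-level error
    // inspect resp.status_code for per-request errors
}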
+ pub async fn get_rel_size( + &mut self, + req: GetRelSizeRequest, + ) -> tonic::Result { + let req = proto::GetRelSizeRequest::from(req); + let resp = self.inner.get_rel_size(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches an SLRU segment. + pub async fn get_slru_segment( + &mut self, + req: GetSlruSegmentRequest, + ) -> tonic::Result { + let req = proto::GetSlruSegmentRequest::from(req); + let resp = self.inner.get_slru_segment(req).await?.into_inner(); + Ok(resp.try_into()?) + } + + /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't + /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. + /// + /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be + /// acquired because the LSN has already been garbage collected. + pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result { + let req = proto::LeaseLsnRequest::from(req); + let resp = self.inner.lease_lsn(req).await?.into_inner(); + Ok(resp.try_into()?) + } +} + +/// Adds authentication metadata to gRPC requests. #[derive(Clone)] struct AuthInterceptor { tenant_id: AsciiMetadataValue, @@ -30,174 +158,29 @@ impl AuthInterceptor { fn new( tenant_id: TenantId, timeline_id: TimelineId, - auth_token: Option, shard_id: ShardIndex, - ) -> Result { - let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?; - let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?; - let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?; - - let auth_header: Option = match auth_token { - Some(token) => Some(format!("Bearer {token}").try_into()?), - None => None, - }; - + auth_token: Option, + ) -> anyhow::Result { Ok(Self { - tenant_id: tenant_ascii, - shard_id: shard_ascii, - timeline_id: timeline_ascii, - auth_header, + tenant_id: tenant_id.to_string().try_into()?, + timeline_id: timeline_id.to_string().try_into()?, + shard_id: shard_id.to_string().try_into()?, + auth_header: auth_token + .map(|token| format!("Bearer {token}").try_into()) + .transpose()?, }) } } -impl tonic::service::Interceptor for AuthInterceptor { - fn call(&mut self, mut req: tonic::Request<()>) -> Result, tonic::Status> { - req.metadata_mut() - .insert("neon-tenant-id", self.tenant_id.clone()); - req.metadata_mut() - .insert("neon-shard-id", self.shard_id.clone()); - req.metadata_mut() - .insert("neon-timeline-id", self.timeline_id.clone()); - if let Some(auth_header) = &self.auth_header { - req.metadata_mut() - .insert("authorization", auth_header.clone()); +impl Interceptor for AuthInterceptor { + fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result> { + let metadata = req.metadata_mut(); + metadata.insert("neon-tenant-id", self.tenant_id.clone()); + metadata.insert("neon-timeline-id", self.timeline_id.clone()); + metadata.insert("neon-shard-id", self.shard_id.clone()); + if let Some(ref auth_header) = self.auth_header { + metadata.insert("authorization", auth_header.clone()); } Ok(req) } } - -#[derive(Clone)] -pub struct Client { - client: proto::PageServiceClient< - tonic::service::interceptor::InterceptedService, - >, -} - -impl Client { - pub async fn new + Send + Sync + 'static>( - into_endpoint: T, - tenant_id: TenantId, - timeline_id: TimelineId, - shard_id: ShardIndex, - auth_header: Option, - compression: Option, - ) -> anyhow::Result { - let endpoint: tonic::transport::Endpoint = into_endpoint - .try_into() - .map_err(|_e| anyhow::anyhow!("failed to convert 
endpoint"))?; - let channel = endpoint.connect().await?; - let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id) - .map_err(|e| anyhow::anyhow!(e.to_string()))?; - let mut client = proto::PageServiceClient::with_interceptor(channel, auth); - - if let Some(compression) = compression { - // TODO: benchmark this (including network latency). - client = client - .accept_compressed(compression) - .send_compressed(compression); - } - - Ok(Self { client }) - } - - /// Returns whether a relation exists. - pub async fn check_rel_exists( - &mut self, - req: model::CheckRelExistsRequest, - ) -> Result { - let proto_req = proto::CheckRelExistsRequest::from(req); - - let response = self.client.check_rel_exists(proto_req).await?; - - let proto_resp = response.into_inner(); - Ok(proto_resp.into()) - } - - /// Fetches a base backup. - pub async fn get_base_backup( - &mut self, - req: model::GetBaseBackupRequest, - ) -> Result, tonic::Status> { - let req = proto::GetBaseBackupRequest::from(req); - let chunks = self.client.get_base_backup(req).await?.into_inner(); - let reader = StreamReader::new( - chunks - .map_ok(|resp| resp.chunk) - .map_err(std::io::Error::other), - ); - Ok(reader) - } - - /// Returns the total size of a database, as # of bytes. - pub async fn get_db_size( - &mut self, - req: model::GetDbSizeRequest, - ) -> Result { - let proto_req = proto::GetDbSizeRequest::from(req); - - let response = self.client.get_db_size(proto_req).await?; - Ok(response.into_inner().into()) - } - - /// Fetches pages. - /// - /// This is implemented as a bidirectional streaming RPC for performance. - /// Per-request errors are often returned as status_code instead of errors, - /// to avoid tearing down the entire stream via tonic::Status. - pub async fn get_pages( - &mut self, - inbound: ReqSt, - ) -> Result< - impl Stream> + Send + 'static, - tonic::Status, - > - where - ReqSt: Stream + Send + 'static, - { - let outbound_proto = inbound.map(|domain_req| domain_req.into()); - - let req_new = Request::new(outbound_proto); - - let response_stream: Streaming = - self.client.get_pages(req_new).await?.into_inner(); - - let domain_stream = response_stream.map_ok(model::GetPageResponse::from); - - Ok(domain_stream) - } - - /// Returns the size of a relation, as # of blocks. - pub async fn get_rel_size( - &mut self, - req: model::GetRelSizeRequest, - ) -> Result { - let proto_req = proto::GetRelSizeRequest::from(req); - let response = self.client.get_rel_size(proto_req).await?; - let proto_resp = response.into_inner(); - Ok(proto_resp.into()) - } - - /// Fetches an SLRU segment. - pub async fn get_slru_segment( - &mut self, - req: model::GetSlruSegmentRequest, - ) -> Result { - let proto_req = proto::GetSlruSegmentRequest::from(req); - let response = self.client.get_slru_segment(proto_req).await?; - Ok(response.into_inner().try_into()?) - } - - /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't - /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. - /// - /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be - /// acquired because the LSN has already been garbage collected. - pub async fn lease_lsn( - &mut self, - req: model::LeaseLsnRequest, - ) -> Result { - let req = proto::LeaseLsnRequest::from(req); - Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?) 
- } -} diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 4b7a70504a..c14bb73136 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -326,7 +326,7 @@ impl GrpcClient { ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { - let inner = page_api::Client::new( + let inner = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index a297819e9b..f14caf548c 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -625,7 +625,7 @@ impl GrpcClient { ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b2ac27b56d..99d7e0ca3a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -145,7 +145,7 @@ pub struct PageServerConf { pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, - pub disk_usage_based_eviction: Option, + pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, @@ -706,9 +706,12 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { + use std::time::Duration; + use camino::Utf8PathBuf; + use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder}; use rstest::rstest; - use utils::id::NodeId; + use utils::{id::NodeId, serde_percent::Percent}; use super::PageServerConf; @@ -807,4 +810,70 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } + + #[rstest] + #[ + case::omit_the_whole_config( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + eviction_order: Default::default(), + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: true, + }, + r#" + control_plane_api = "http://localhost:6666" + "#, + )] + #[ + case::omit_enabled_field( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 1_000_000_000, + period: Duration::from_secs(60), + eviction_order: EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }, + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: true, + }, + r#" + control_plane_api = "http://localhost:6666" + disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" } + "#, + )] + #[case::disabled( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + eviction_order: EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }, + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: false, + }, + r#" + control_plane_api = "http://localhost:6666" + disk_usage_based_eviction = { enabled = false } + "# + )] + fn test_config_disk_usage_based_eviction_is_valid( + #[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig, + #[case] input: &str, + ) { + let config_toml = toml_edit::de::from_str::(input) + .expect("disk_usage_based_eviction is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + let config = 
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap(); + let disk_usage_based_eviction = config.disk_usage_based_eviction; + assert_eq!( + expected_disk_usage_based_eviction, + disk_usage_based_eviction + ); + } } diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index eba773272a..16d42b6fe4 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -99,7 +99,7 @@ pub(super) async fn upload_metrics_bucket( // Compose object path let datetime: DateTime = SystemTime::now().into(); - let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/hour=%H/%H:%M:%SZ"); let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; // Set up a gzip writer into a buffer @@ -109,7 +109,7 @@ pub(super) async fn upload_metrics_bucket( // Serialize and write into compressed buffer let started_at = std::time::Instant::now(); - for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) { + for res in serialize_in_chunks_ndjson(CHUNK_SIZE, metrics, idempotency_keys) { let (_chunk, body) = res?; gzip_writer.write_all(&body).await?; } @@ -216,6 +216,86 @@ fn serialize_in_chunks<'a>( } } +/// Serializes the input metrics as NDJSON in chunks of chunk_size. Each event +/// is serialized as a separate JSON object on its own line. The provided +/// idempotency keys are injected into the corresponding metric events (reused +/// across different metrics sinks), and must have the same length as input. +fn serialize_in_chunks_ndjson<'a>( + chunk_size: usize, + input: &'a [NewRawMetric], + idempotency_keys: &'a [IdempotencyKey<'a>], +) -> impl ExactSizeIterator> + 'a +{ + use bytes::BufMut; + + assert_eq!(input.len(), idempotency_keys.len()); + + struct Iter<'a> { + inner: std::slice::Chunks<'a, NewRawMetric>, + idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, + chunk_size: usize, + + // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries + buffer: bytes::BytesMut, + // chunk amount of events are reused to produce the serialized document + scratch: Vec>, + } + + impl<'a> Iterator for Iter<'a> { + type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>; + + fn next(&mut self) -> Option { + let chunk = self.inner.next()?; + + if self.scratch.is_empty() { + // first round: create events with N strings + self.scratch.extend( + chunk + .iter() + .zip(&mut self.idempotency_keys) + .map(|(raw_metric, key)| raw_metric.as_event(key)), + ); + } else { + // next rounds: update_in_place to reuse allocations + assert_eq!(self.scratch.len(), self.chunk_size); + itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) + .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); + } + + // Serialize each event as NDJSON (one JSON object per line) + for event in self.scratch[..chunk.len()].iter() { + let res = serde_json::to_writer((&mut self.buffer).writer(), event); + if let Err(e) = res { + return Some(Err(e)); + } + // Add newline after each event to follow NDJSON format + self.buffer.put_u8(b'\n'); + } + + Some(Ok((chunk, self.buffer.split().freeze()))) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + } + + impl ExactSizeIterator for Iter<'_> {} + + let buffer = bytes::BytesMut::new(); + let inner = input.chunks(chunk_size); + let idempotency_keys = idempotency_keys.iter(); + let 
scratch = Vec::new(); + + Iter { + inner, + idempotency_keys, + chunk_size, + buffer, + scratch, + } +} + trait RawMetricExt { fn as_event(&self, key: &IdempotencyKey<'_>) -> Event; fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>); @@ -479,6 +559,43 @@ mod tests { } } + #[test] + fn chunked_serialization_ndjson() { + let examples = metric_samples(); + assert!(examples.len() > 1); + + let now = Utc::now(); + let idempotency_keys = (0..examples.len()) + .map(|i| FixedGen::new(now, "1", i as u16).generate()) + .collect::>(); + + // Parse NDJSON format - each line is a separate JSON object + let parse_ndjson = |body: &[u8]| -> Vec> { + let body_str = std::str::from_utf8(body).unwrap(); + body_str + .trim_end_matches('\n') + .lines() + .filter(|line| !line.is_empty()) + .map(|line| serde_json::from_str::>(line).unwrap()) + .collect() + }; + + let correct = serialize_in_chunks_ndjson(examples.len(), &examples, &idempotency_keys) + .map(|res| res.unwrap().1) + .flat_map(|body| parse_ndjson(&body)) + .collect::>(); + + for chunk_size in 1..examples.len() { + let actual = serialize_in_chunks_ndjson(chunk_size, &examples, &idempotency_keys) + .map(|res| res.unwrap().1) + .flat_map(|body| parse_ndjson(&body)) + .collect::>(); + + // if these are equal, it means that multi-chunking version works as well + assert_eq!(correct, actual); + } + } + #[derive(Clone, Copy)] struct FixedGen<'a>(chrono::DateTime, &'a str, u16); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f13b3709f5..f1d34664a8 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task( tenant_manager: Arc, background_jobs_barrier: completion::Barrier, ) -> Option { - let Some(task_config) = &conf.disk_usage_based_eviction else { + let task_config = &conf.disk_usage_based_eviction; + if !task_config.enabled { info!("disk usage based eviction task not configured"); return None; }; @@ -458,6 +459,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( match next { Ok(Ok(file_size)) => { METRICS.layers_evicted.inc(); + /*BEGIN_HADRON */ + METRICS.bytes_evicted.inc_by(file_size); + /*END_HADRON */ usage_assumed.add_available_bytes(file_size); } Ok(Err(( @@ -1265,6 +1269,7 @@ mod filesystem_level_usage { #[cfg(feature = "testing")] mock_statvfs: None, eviction_order: pageserver_api::config::EvictionOrder::default(), + enabled: true, }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 3080b0db34..65cac8eea1 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,4 +1,8 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + sync::{Arc, atomic::AtomicBool}, + time::Duration, +}; use arc_swap::ArcSwap; use pageserver_api::config::NodeMetadata; @@ -6,12 +10,13 @@ use posthog_client_lite::{ CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError, PostHogFlagFilterPropertyValue, }; +use rand::Rng; use remote_storage::RemoteStorageKind; use serde_json::json; use tokio_util::sync::CancellationToken; use utils::id::TenantId; -use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; +use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION, tenant::TenantShard}; const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600); @@ -138,6 +143,7 @@ 
impl FeatureResolver { } Arc::new(properties) }; + let fake_tenants = { let mut tenants = Vec::new(); for i in 0..10 { @@ -147,9 +153,16 @@ impl FeatureResolver { conf.id, i ); + + let tenant_properties = PerTenantProperties { + remote_size_mb: Some(rand::thread_rng().gen_range(100.0..1000000.00)), + } + .into_posthog_properties(); + let properties = Self::collect_properties_inner( distinct_id.clone(), Some(&internal_properties), + &tenant_properties, ); tenants.push(CaptureEvent { event: "initial_tenant_report".to_string(), @@ -183,6 +196,7 @@ impl FeatureResolver { fn collect_properties_inner( tenant_id: String, internal_properties: Option<&HashMap>, + tenant_properties: &HashMap, ) -> HashMap { let mut properties = HashMap::new(); if let Some(internal_properties) = internal_properties { @@ -194,6 +208,9 @@ impl FeatureResolver { "tenant_id".to_string(), PostHogFlagFilterPropertyValue::String(tenant_id), ); + for (key, value) in tenant_properties.iter() { + properties.insert(key.clone(), value.clone()); + } properties } @@ -201,8 +218,13 @@ impl FeatureResolver { pub(crate) fn collect_properties( &self, tenant_id: TenantId, + tenant_properties: &HashMap, ) -> HashMap { - Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref()) + Self::collect_properties_inner( + tenant_id.to_string(), + self.internal_properties.as_deref(), + tenant_properties, + ) } /// Evaluate a multivariate feature flag. Currently, we do not support any properties. @@ -214,6 +236,7 @@ impl FeatureResolver { &self, flag_key: &str, tenant_id: TenantId, + tenant_properties: &HashMap, ) -> Result { let force_overrides = self.force_overrides_for_testing.load(); if let Some(value) = force_overrides.get(flag_key) { @@ -224,7 +247,7 @@ impl FeatureResolver { let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), - &self.collect_properties(tenant_id), + &self.collect_properties(tenant_id, tenant_properties), ); match &res { Ok(value) => { @@ -257,6 +280,7 @@ impl FeatureResolver { &self, flag_key: &str, tenant_id: TenantId, + tenant_properties: &HashMap, ) -> Result<(), PostHogEvaluationError> { let force_overrides = self.force_overrides_for_testing.load(); if let Some(value) = force_overrides.get(flag_key) { @@ -271,7 +295,7 @@ impl FeatureResolver { let res = inner.feature_store().evaluate_boolean( flag_key, &tenant_id.to_string(), - &self.collect_properties(tenant_id), + &self.collect_properties(tenant_id, tenant_properties), ); match &res { Ok(()) => { @@ -317,3 +341,93 @@ impl FeatureResolver { .store(Arc::new(force_overrides)); } } + +struct PerTenantProperties { + pub remote_size_mb: Option, +} + +impl PerTenantProperties { + pub fn into_posthog_properties(self) -> HashMap { + let mut properties = HashMap::new(); + if let Some(remote_size_mb) = self.remote_size_mb { + properties.insert( + "tenant_remote_size_mb".to_string(), + PostHogFlagFilterPropertyValue::Number(remote_size_mb), + ); + } + properties + } +} + +pub struct TenantFeatureResolver { + inner: FeatureResolver, + tenant_id: TenantId, + cached_tenant_properties: ArcSwap>, + + // Add feature flag on the critical path below. + // + // If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop insetad of + // resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the + // housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions. 
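// Editor's sketch of the intended critical-path read (names from this patch; `tenant_shard`
// stands in for whatever holds the TenantFeatureResolver): read the cached AtomicBool
// instead of calling evaluate_boolean(), and rely on refresh_properties_and_flags() in the
// housekeeping loop to keep it current.
if tenant_shard
    .feature_resolver
    .feature_test_remote_size_flag
    .load(std::sync::atomic::Ordering::Relaxed)
{
    // flag-gated fast-path behaviour
}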
+ pub feature_test_remote_size_flag: AtomicBool, +} + +impl TenantFeatureResolver { + pub fn new(inner: FeatureResolver, tenant_id: TenantId) -> Self { + Self { + inner, + tenant_id, + cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())), + feature_test_remote_size_flag: AtomicBool::new(false), + } + } + + pub fn evaluate_multivariate(&self, flag_key: &str) -> Result { + self.inner.evaluate_multivariate( + flag_key, + self.tenant_id, + &self.cached_tenant_properties.load(), + ) + } + + pub fn evaluate_boolean(&self, flag_key: &str) -> Result<(), PostHogEvaluationError> { + self.inner.evaluate_boolean( + flag_key, + self.tenant_id, + &self.cached_tenant_properties.load(), + ) + } + + pub fn collect_properties(&self) -> HashMap { + self.inner + .collect_properties(self.tenant_id, &self.cached_tenant_properties.load()) + } + + pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result { + self.inner.is_feature_flag_boolean(flag_key) + } + + /// Refresh the cached properties and flags on the critical path. + pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) { + let mut remote_size_mb = None; + for timeline in tenant_shard.list_timelines() { + let size = timeline.metrics.resident_physical_size_get(); + if size == 0 { + remote_size_mb = None; + } + if let Some(ref mut remote_size_mb) = remote_size_mb { + *remote_size_mb += size as f64 / 1024.0 / 1024.0; + } + } + self.cached_tenant_properties.store(Arc::new( + PerTenantProperties { remote_size_mb }.into_posthog_properties(), + )); + + // BEGIN: Update the feature flag on the critical path. + self.feature_test_remote_size_flag.store( + self.evaluate_boolean("test-remote-size-flag").is_ok(), + std::sync::atomic::Ordering::Relaxed, + ); + // END: Update the feature flag on the critical path. + } +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 119275f885..2995a37089 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -61,6 +61,7 @@ use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::feature_resolver::FeatureResolver; +use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationConf; @@ -2438,6 +2439,7 @@ async fn timeline_offload_handler( .map_err(|e| { match e { OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()), + OffloadError::AlreadyInProgress => ApiError::Conflict("Timeline already being offloaded or deleted".into()), _ => ApiError::InternalServerError(anyhow!(e)) } })?; @@ -2500,10 +2502,7 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), - CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e), - CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -3629,6 +3628,17 @@ async fn activate_post_import_handler( .await } +// [Hadron] Reset gauge metrics that are used to raised alerts. We need this API as a stop-gap measure to reset alerts +// after we manually rectify situations such as local SSD data loss. We will eventually automate this. 
+async fn hadron_reset_alert_gauges( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + LOCAL_DATA_LOSS_SUSPECTED.set(0); + json_response(StatusCode::OK, ()) +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. @@ -3697,23 +3707,25 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id); + // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) + // and we don't need to worry about it for now. + let properties = tenant.feature_resolver.collect_properties(); if as_type.as_deref() == Some("boolean") { - let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); + let result = tenant.feature_resolver.evaluate_boolean(&flag); let result = result.map(|_| true).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else if as_type.as_deref() == Some("multivariate") { - let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); + let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { // Auto infer the type of the feature flag. let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?; if is_boolean { - let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); + let result = tenant.feature_resolver.evaluate_boolean(&flag); let result = result.map(|_| true).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { - let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); + let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } } @@ -4153,5 +4165,8 @@ pub fn make_router( .post("/v1/feature_flag_spec", |r| { api_handler(r, update_feature_flag_spec) }) + .post("/hadron-internal/reset_alert_gauges", |r| { + api_handler(r, hadron_reset_alert_gauges) + }) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 985dce17e2..4680d20697 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,3 +1,4 @@ +use std::cell::Cell; use std::collections::HashMap; use std::num::NonZeroUsize; use std::os::fd::RawFd; @@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operation duration in seconds, like compaction, GC, size calculation. 
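// The pageserver_active_storage_operations_count gauge registered just below is driven later
// in this patch by StorageTimeMetricsTimer: incremented when a timer is created, decremented
// when it stops or is dropped. A minimal sketch of that guard pattern using the prometheus
// crate directly; `ActiveOpGuard` is an illustrative name, and the real timer also records a
// duration histogram when stopped.
use prometheus::IntGauge;
use std::cell::Cell;

struct ActiveOpGuard {
    gauge: IntGauge,
    stopped: Cell<bool>,
}

impl ActiveOpGuard {
    fn start(gauge: IntGauge) -> Self {
        gauge.inc(); // one more operation in flight
        Self {
            gauge,
            stopped: Cell::new(false),
        }
    }

    fn stop(&self) {
        self.stopped.set(true);
        self.gauge.dec(); // normal completion path
    }
}

impl Drop for ActiveOpGuard {
    fn drop(&mut self) {
        // If the operation was cancelled or panicked before stop(), still decrement so the
        // gauge does not leak.
        if !self.stopped.get() {
            self.gauge.dec();
        }
    }
}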
+/* BEGIN_HADRON */
+pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_active_storage_operations_count",
+        "Count of active storage operations with operation, tenant and timeline dimensions",
+        &["operation", "tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+/*END_HADRON */
+
+// Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

 pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
@@ -2801,6 +2813,31 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
 pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
     Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

+pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_local_data_loss_suspected",
+        "Non-zero value indicates that pageserver local data loss is suspected (and highly likely)."
+    )
+    .expect("failed to define a metric")
+});
+
+// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish
+// them from other types of requests (SK WAL replication, HTTP requests, etc.). PageStream requests are used by
+// Postgres compute to fetch data from pageservers.
+// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the
+// request, or if the pageserver is not the "primary" serving the tenant shard. These errors almost always indicate
+// issues with compute configuration, caused by either the compute node itself being stuck in the wrong
+// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration
+// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests
+// are indicative of bugs (and unavailability).
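+// The counter below is incremented in page_service.rs (later in this patch) right before the
+// handler replies with PageStreamError::Reconnect, i.e. when a PageStream request arrives for
+// a tenant shard that this pageserver does not currently serve.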
+pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_misrouted_pagestream_requests_total", + "Number of pageserver pagestream requests that were routed to the wrong pageserver" + ) + .expect("failed to define a metric") +}); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting @@ -3039,13 +3076,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, + stopped: Cell, } impl StorageTimeMetricsTimer { fn new(metrics: StorageTimeMetrics) -> Self { + /*BEGIN_HADRON */ + // record the active operation as the timer starts + metrics.timeline_active_count.inc(); + /*END_HADRON */ Self { metrics, start: Instant::now(), + stopped: Cell::new(false), } } @@ -3061,6 +3104,10 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(seconds); + /* BEGIN_HADRON*/ + self.stopped.set(true); + self.metrics.timeline_active_count.dec(); + /*END_HADRON */ duration } @@ -3071,6 +3118,16 @@ impl StorageTimeMetricsTimer { } } +/*BEGIN_HADRON */ +impl Drop for StorageTimeMetricsTimer { + fn drop(&mut self) { + if !self.stopped.get() { + self.metrics.timeline_active_count.dec(); + } + } +} +/*END_HADRON */ + pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); impl Drop for AlwaysRecordingStorageTimeMetricsTimer { @@ -3096,6 +3153,10 @@ pub(crate) struct StorageTimeMetrics { timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id timeline_count: IntCounter, + /*BEGIN_HADRON */ + /// Number of active operations per operation, tenant_id, and timeline_id + timeline_active_count: IntGauge, + /*END_HADRON */ /// Global histogram having only the "operation" label. 
global_histogram: Histogram, } @@ -3115,6 +3176,11 @@ impl StorageTimeMetrics { let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); + /*BEGIN_HADRON */ + let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) + .unwrap(); + /*END_HADRON */ let global_histogram = STORAGE_TIME_GLOBAL .get_metric_with_label_values(&[operation]) .unwrap(); @@ -3122,6 +3188,7 @@ impl StorageTimeMetrics { StorageTimeMetrics { timeline_sum, timeline_count, + timeline_active_count, global_histogram, } } @@ -3529,6 +3596,14 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + /* BEGIN_HADRON */ + let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[ + op, + tenant_id, + shard_id, + timeline_id, + ]); + /*END_HADRON */ } for op in StorageIoSizeOperation::VARIANTS { @@ -4321,6 +4396,9 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) layers_collected: IntCounter, pub(crate) layers_selected: IntCounter, pub(crate) layers_evicted: IntCounter, + /*BEGIN_HADRON */ + pub(crate) bytes_evicted: IntCounter, + /*END_HADRON */ } impl Default for Metrics { @@ -4357,12 +4435,21 @@ pub(crate) mod disk_usage_based_eviction { ) .unwrap(); + /*BEGIN_HADRON */ + let bytes_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_bytes_total", + "Amount of bytes successfully evicted" + ) + .unwrap(); + /*END_HADRON */ + Self { tenant_collection_time, tenant_layer_count, layers_collected, layers_selected, layers_evicted, + bytes_evicted, } } } @@ -4482,6 +4569,7 @@ pub fn preinitialize_metrics( &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, + &MISROUTED_PAGESTREAM_REQUESTS, ] .into_iter() .for_each(|c| { @@ -4519,6 +4607,7 @@ pub fn preinitialize_metrics( // gauges WALRECEIVER_ACTIVE_MANAGERS.get(); + LOCAL_DATA_LOSS_SUSPECTED.get(); // histograms [ diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1d824ac846..6b614deac8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -50,6 +50,7 @@ use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, Bu use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tonic::service::Interceptor as _; +use tonic::transport::server::TcpConnectInfo; use tracing::*; use utils::auth::{Claims, Scope, SwappableJwtAuth}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; @@ -69,7 +70,7 @@ use crate::context::{ }; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, - SmgrOpTimer, TimelineMetrics, + MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics, }; use crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ @@ -90,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation}; /// is not yet in state [`TenantState::Active`]. /// /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// HADRON: reduced timeout and we will retry in Cache::get(). +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); /// Threshold at which to log slow GetPage requests. 
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); @@ -1127,6 +1129,7 @@ impl PageServerHandler { // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration // and talk to a different pageserver. + MISROUTED_PAGESTREAM_REQUESTS.inc(); return respond_error!( span, PageStreamError::Reconnect( @@ -3685,8 +3688,15 @@ impl proto::PageService for GrpcPageServiceHandler { yield match result { Ok(resp) => resp, // Convert per-request errors to GetPageResponses as appropriate, or terminate - // the stream with a tonic::Status. - Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(), + // the stream with a tonic::Status. Log the error regardless, since + // ObservabilityLayer can't automatically log stream errors. + Err(status) => { + // TODO: it would be nice if we could propagate the get_page() fields here. + span.in_scope(|| { + warn!("request failed with {:?}: {}", status.code(), status.message()); + }); + page_api::GetPageResponse::try_from_status(status, req_id)?.into() + } } } }; @@ -3824,40 +3834,85 @@ impl tonic::server::NamedService for Observabili const NAME: &'static str = S::NAME; // propagate inner service name } -impl tower::Service> for ObservabilityLayerService +impl tower::Service> for ObservabilityLayerService where - S: tower::Service>, + S: tower::Service, Response = http::Response> + Send, S::Future: Send + 'static, { type Response = S::Response; type Error = S::Error; type Future = BoxFuture<'static, Result>; - fn call(&mut self, mut req: http::Request) -> Self::Future { + fn call(&mut self, mut req: http::Request) -> Self::Future { // Record the request start time as a request extension. // // TODO: we should start a timer here instead, but it currently requires a timeline handle // and SmgrQueryType, which we don't have yet. Refactor it to provide it later. req.extensions_mut().insert(ReceivedAt(Instant::now())); - // Create a basic tracing span. Enter the span for the current thread (to use it for inner - // sync code like interceptors), and instrument the future (to use it for inner async code - // like the page service itself). + // Extract the peer address and gRPC method. + let peer = req + .extensions() + .get::() + .and_then(|info| info.remote_addr()) + .map(|addr| addr.to_string()) + .unwrap_or_default(); + + let method = req + .uri() + .path() + .split('/') + .nth(2) + .unwrap_or(req.uri().path()) + .to_string(); + + // Create a basic tracing span. // - // The instrument() call below is not sufficient. It only affects the returned future, and - // only takes effect when the caller polls it. Any sync code executed when we call - // self.inner.call() below (such as interceptors) runs outside of the returned future, and - // is not affected by it. We therefore have to enter the span on the current thread too. + // Enter the span for the current thread and instrument the future. It is not sufficient to + // only instrument the future, since it only takes effect after the future is returned and + // polled, not when the inner service is called below (e.g. during interceptor execution). let span = info_span!( "grpc:pageservice", - // Set by TenantMetadataInterceptor. + // These will be populated by TenantMetadataInterceptor. tenant_id = field::Empty, timeline_id = field::Empty, shard_id = field::Empty, + // NB: empty fields must be listed first above. 
Otherwise, the field names will be + // clobbered when the empty fields are populated. They will be output last regardless. + %peer, + %method, ); let _guard = span.enter(); - Box::pin(self.inner.call(req).instrument(span.clone())) + // Construct a future for calling the inner service, but don't await it. This avoids having + // to clone the inner service into the future below. + let call = self.inner.call(req); + + async move { + // Await the inner service call. + let result = call.await; + + // Log gRPC error statuses. This won't include request info from handler spans, but it + // will catch all errors (even those emitted before handler spans are constructed). Only + // unary request errors are logged here, not streaming response errors. + if let Ok(ref resp) = result + && let Some(status) = tonic::Status::from_header_map(resp.headers()) + && status.code() != tonic::Code::Ok + { + // TODO: it would be nice if we could propagate the handler span's request fields + // here. This could e.g. be done by attaching the request fields to + // tonic::Status::metadata via a proc macro. + warn!( + "request failed with {:?}: {}", + status.code(), + status.message() + ); + } + + result + } + .instrument(span.clone()) + .boxed() } fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 31f38d485f..8532a6938f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError { Cancelled, } +impl CollectKeySpaceError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + CollectKeySpaceError::Decode(_) => false, + CollectKeySpaceError::PageRead(e) => e.is_cancel(), + CollectKeySpaceError::Cancelled => true, + } + } + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + CollectKeySpaceError::Decode(e) => anyhow::Error::new(e), + CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e), + CollectKeySpaceError::Cancelled => anyhow::Error::new(self), + } + } +} + impl From for CollectKeySpaceError { fn from(err: PageReconstructError) -> Self { match err { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 70a4120c71..f576119db8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -86,7 +86,7 @@ use crate::context; use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::feature_resolver::FeatureResolver; +use crate::feature_resolver::{FeatureResolver, TenantFeatureResolver}; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::{ BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, @@ -142,6 +142,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; +#[cfg(test)] +pub mod debug; + pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -388,7 +391,7 @@ pub struct TenantShard { l0_flush_global_state: L0FlushGlobalState, - pub(crate) feature_resolver: FeatureResolver, + pub(crate) feature_resolver: Arc, } impl std::fmt::Debug for TenantShard { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -3265,7 +3268,7 @@ impl TenantShard { }; let gc_compaction_strategy = self .feature_resolver - .evaluate_multivariate("gc-comapction-strategy", self.tenant_shard_id.tenant_id) 
+ .evaluate_multivariate("gc-comapction-strategy") .ok(); let span = if let Some(gc_compaction_strategy) = gc_compaction_strategy { info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id, strategy = %gc_compaction_strategy) @@ -3287,7 +3290,10 @@ impl TenantShard { .or_else(|err| match err { // Ignore this, we likely raced with unarchival. OffloadError::NotArchived => Ok(()), - err => Err(err), + OffloadError::AlreadyInProgress => Ok(()), + OffloadError::Cancelled => Err(CompactionError::ShuttingDown), + // don't break the anyhow chain + OffloadError::Other(err) => Err(CompactionError::Other(err)), })?; } @@ -3318,23 +3324,12 @@ impl TenantShard { match err { err if err.is_cancel() => {} CompactionError::ShuttingDown => (), - // Offload failures don't trip the circuit breaker, since they're cheap to retry and - // shouldn't block compaction. - CompactionError::Offload(_) => {} - CompactionError::CollectKeySpaceError(err) => { - // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, err); - } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } - CompactionError::AlreadyRunning(_) => {} } } @@ -3410,6 +3405,9 @@ impl TenantShard { if let Some(ref walredo_mgr) = self.walredo_mgr { walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT); } + + // Update the feature resolver with the latest tenant-spcific data. + self.feature_resolver.refresh_properties_and_flags(self); } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { @@ -4498,7 +4496,10 @@ impl TenantShard { gc_block: Default::default(), l0_flush_global_state, basebackup_cache, - feature_resolver, + feature_resolver: Arc::new(TenantFeatureResolver::new( + feature_resolver, + tenant_shard_id.tenant_id, + )), } } @@ -6010,12 +6011,11 @@ pub(crate) mod harness { } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) async fn do_try_load( + pub(crate) async fn do_try_load_with_redo( &self, + walredo_mgr: Arc, ctx: &RequestContext, ) -> anyhow::Result> { - let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); - let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant = Arc::new(TenantShard::new( @@ -6053,6 +6053,14 @@ pub(crate) mod harness { Ok(tenant) } + pub(crate) async fn do_try_load( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); + self.do_try_load_with_redo(walredo_mgr, ctx).await + } + pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } @@ -6129,7 +6137,7 @@ mod tests { use pageserver_api::keyspace::KeySpace; #[cfg(feature = "testing")] use pageserver_api::keyspace::KeySpaceRandomAccum; - use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease}; use pageserver_compaction::helpers::overlaps_with; #[cfg(feature = "testing")] use rand::SeedableRng; @@ -9391,6 +9399,14 @@ mod tests { .unwrap() .load() .await; + // set a non-zero lease length to test the feature + tenant + .update_tenant_config(|mut conf| { + conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH); + Ok(conf) + }) + .unwrap(); + let key = 
Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); diff --git a/pageserver/src/tenant/debug.rs b/pageserver/src/tenant/debug.rs new file mode 100644 index 0000000000..604f7f265e --- /dev/null +++ b/pageserver/src/tenant/debug.rs @@ -0,0 +1,366 @@ +use std::{ops::Range, str::FromStr, sync::Arc}; + +use crate::walredo::RedoAttemptType; +use base64::{Engine as _, engine::general_purpose::STANDARD}; +use bytes::{Bytes, BytesMut}; +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools; +use pageserver_api::{ + key::Key, + keyspace::KeySpace, + shard::{ShardIdentity, ShardStripeSize}, +}; +use postgres_ffi::PgMajorVersion; +use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn}; +use tracing::Instrument; +use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + shard::{ShardCount, ShardIndex, ShardNumber}, +}; +use wal_decoder::models::record::NeonWalRecord; + +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::storage_layer::ValueReconstructState, + walredo::harness::RedoHarness, +}; + +use super::{ + WalRedoManager, WalredoManagerId, + harness::TenantHarness, + remote_timeline_client::LayerFileMetadata, + storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState}, +}; + +fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes { + // To match the logic in libs/wal_decoder/src/serialized_batch.rs + let mut new_image: BytesMut = img_bytes.into(); + if is_fpw && !page_is_new(&new_image) { + page_set_lsn(&mut new_image, next_record_lsn); + } + assert_eq!(new_image.len(), BLCKSZ as usize); + new_image.freeze() +} + +async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let redo_harness = RedoHarness::new()?; + let span = redo_harness.span(); + let tenant_conf = pageserver_api::models::TenantConfig { + ..Default::default() + }; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let tenant = TenantHarness::create_custom( + "search_key", + tenant_conf, + tenant_id, + ShardIdentity::unsharded(), + Generation::new(1), + ) + .await? + .do_try_load_with_redo( + Arc::new(WalRedoManager::Prod( + WalredoManagerId::next(), + redo_harness.manager, + )), + &ctx, + ) + .await + .unwrap(); + let timeline = tenant + .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) + .await?; + let contents = tokio::fs::read_to_string(input) + .await + .map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}"))) + .unwrap(); + let lines = contents.lines(); + let mut last_wal_lsn: Option = None; + let state = { + let mut state = ValueReconstructState::default(); + let mut is_fpw = false; + let mut is_first_line = true; + for line in lines { + if is_first_line { + is_first_line = false; + if line.trim() == "FPW" { + is_fpw = true; + } + continue; // Skip the first line. + } + // Each input line is in the "," format. + let (lsn_str, payload_b64) = line + .split_once(',') + .expect("Invalid input format: expected ','"); + + // Parse the LSN and decode the payload. + let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format"); + let bytes = Bytes::from( + STANDARD + .decode(payload_b64.trim()) + .expect("Invalid base64 payload"), + ); + + // The first line is considered the base image, the rest are WAL records. 
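// A hypothetical example of the expected input file (LSNs and payloads invented purely for
// illustration):
//
//   FPW
//   0/16B9D50,<base64-encoded page image>
//   0/16B9A08,<base64-encoded WAL record>
//   0/16BA1C0,<base64-encoded WAL record>
//
// i.e. a first marker line ("FPW" if the base image came from a full-page write), followed by
// one "<lsn>,<base64 payload>" entry per line: the base page image first, then WAL records
// ordered oldest to newest.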
+ if state.img.is_none() { + state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes))); + } else { + let wal_record = NeonWalRecord::Postgres { + will_init: false, + rec: bytes, + }; + state.records.push((lsn, wal_record)); + last_wal_lsn.replace(lsn); + } + } + state + }; + + assert!(state.img.is_some(), "No base image found"); + assert!(!state.records.is_empty(), "No WAL records found"); + let result = timeline + .reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage) + .instrument(span.clone()) + .await?; + + eprintln!("final image: {:?}", STANDARD.encode(result)); + + Ok(()) +} + +async fn search_key( + tenant_id: TenantId, + timeline_id: TimelineId, + dir: String, + key: Key, + lsn: Lsn, +) -> anyhow::Result<()> { + let shard_index = ShardIndex { + shard_number: ShardNumber(0), + shard_count: ShardCount(4), + }; + + let redo_harness = RedoHarness::new()?; + let span = redo_harness.span(); + let tenant_conf = pageserver_api::models::TenantConfig { + ..Default::default() + }; + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let tenant = TenantHarness::create_custom( + "search_key", + tenant_conf, + tenant_id, + ShardIdentity::new( + shard_index.shard_number, + shard_index.shard_count, + ShardStripeSize(32768), + ) + .unwrap(), + Generation::new(1), + ) + .await? + .do_try_load_with_redo( + Arc::new(WalRedoManager::Prod( + WalredoManagerId::next(), + redo_harness.manager, + )), + &ctx, + ) + .await + .unwrap(); + + let timeline = tenant + .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) + .await?; + + let mut delta_layers: Vec = Vec::new(); + let mut img_layer: Option = Option::None; + let mut dir = tokio::fs::read_dir(dir).await?; + loop { + let entry = dir.next_entry().await?; + if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() { + break; + } + let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap(); + let layer_name = match LayerName::from_str(path.file_name().unwrap()) { + Ok(name) => name, + Err(_) => { + eprintln!("Skipped invalid layer: {path}"); + continue; + } + }; + let layer = Layer::for_resident( + tenant.conf, + &timeline, + path.clone(), + layer_name, + LayerFileMetadata::new( + tokio::fs::metadata(path.clone()).await?.len(), + Generation::new(1), + shard_index, + ), + ); + if layer.layer_desc().is_delta() { + delta_layers.push(layer.into()); + } else if img_layer.is_none() { + img_layer = Some(layer.into()); + } else { + anyhow::bail!("Found multiple image layers"); + } + } + // sort delta layers based on the descending order of LSN + delta_layers.sort_by(|a, b| { + b.layer_desc() + .get_lsn_range() + .start + .cmp(&a.layer_desc().get_lsn_range().start) + }); + + let mut state = ValuesReconstructState::new(IoConcurrency::Sequential); + + let key_space = KeySpace::single(Range { + start: key, + end: key.next(), + }); + let lsn_range = Range { + start: img_layer + .as_ref() + .map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()), + end: lsn, + }; + for delta_layer in delta_layers.iter() { + delta_layer + .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx) + .await?; + } + + img_layer + .as_ref() + .unwrap() + .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx) + .await?; + + for (_key, result) in std::mem::take(&mut state.keys) { + let state = result.collect_pending_ios().await?; + if state.img.is_some() { + eprintln!( + "image: {}: {:x?}", + state.img.as_ref().unwrap().0, + 
STANDARD.encode(state.img.as_ref().unwrap().1.clone()) + ); + } + for delta in state.records.iter() { + match &delta.1 { + NeonWalRecord::Postgres { will_init, rec } => { + eprintln!( + "delta: {}: will_init: {}, {:x?}", + delta.0, + will_init, + STANDARD.encode(rec) + ); + } + _ => { + eprintln!("delta: {}: {:x?}", delta.0, delta.1); + } + } + } + + let result = timeline + .reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage) + .instrument(span.clone()) + .await?; + eprintln!("final image: {lsn} : {result:?}"); + } + + Ok(()) +} + +/// Redo all WALs against the base image in the input file. Return the base64 encoded final image. +/// Each line in the input file must be in the form "," where: +/// * `` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`. +/// * `` is the base64‐encoded page image (first line) or WAL record (subsequent lines). +/// +/// The first line provides the base image of a page. The LSN is the LSN of "next record" following +/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occuping +/// [0/1, 0/200) in the WAL stream, the LSN appearing along side the page image here should be 0/200. +/// +/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the +/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here +/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing along side the WAL record here should +/// be 0/1. +#[derive(Parser)] +struct RedoWalsCmd { + #[clap(long)] + input: String, + #[clap(long)] + key: String, +} + +#[tokio::test] +async fn test_redo_wals() -> anyhow::Result<()> { + let args = std::env::args().collect_vec(); + let pos = args + .iter() + .position(|arg| arg == "--") + .unwrap_or(args.len()); + let slice = &args[pos..args.len()]; + let cmd = match RedoWalsCmd::try_parse_from(slice) { + Ok(cmd) => cmd, + Err(err) => { + eprintln!("{err}"); + return Ok(()); + } + }; + + let key = Key::from_hex(&cmd.key).unwrap(); + redo_wals(&cmd.input, key).await?; + + Ok(()) +} + +/// Search for a page at the given LSN in all layers of the data_dir. +/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image. 
+#[derive(Parser)] +struct SearchKeyCmd { + #[clap(long)] + tenant_id: String, + #[clap(long)] + timeline_id: String, + #[clap(long)] + data_dir: String, + #[clap(long)] + key: String, + #[clap(long)] + lsn: String, +} + +#[tokio::test] +async fn test_search_key() -> anyhow::Result<()> { + let args = std::env::args().collect_vec(); + let pos = args + .iter() + .position(|arg| arg == "--") + .unwrap_or(args.len()); + let slice = &args[pos..args.len()]; + let cmd = match SearchKeyCmd::try_parse_from(slice) { + Ok(cmd) => cmd, + Err(err) => { + eprintln!("{err}"); + return Ok(()); + } + }; + + let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap(); + let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap(); + let key = Key::from_hex(&cmd.key).unwrap(); + let lsn = Lsn::from_str(&cmd.lsn).unwrap(); + search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?; + + Ok(()) +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index be18b40862..15853d3614 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,7 @@ use crate::controller_upcall_client::{ }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; -use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; +use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, @@ -538,6 +538,21 @@ pub async fn init_tenant_mgr( // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?; + // Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants. + // This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node + // so the Storage Controller has not had the time to move tenants out. + let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes { + tenant_configs.is_empty() && !tenant_modes.is_empty() + } else { + false + }; + if data_loss_suspected { + tracing::error!( + "Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants" + ); + } + LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 }); + tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", tenant_configs.len(), diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index ffb4717d9f..f2fbf656a6 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>( let fs_size = usize::try_from(fs_size) .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; - + /* BEGIN_HADRON */ + let mut metadata = None; + match storage { + // Pass the file path as a storage metadata to minimize changes to neon. + // Otherwise, we need to change the upload interface. 
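+// For example, with put_block_size_mb = 256 (an illustrative value, not a default), a
+// 300 MiB layer file exceeds 256 MiB and gets the "databricks_azure_put_block" metadata
+// entry below, while a 100 MiB layer is uploaded without it.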
+ GenericRemoteStorage::AzureBlob(s) => { + let block_size_mb = s.put_block_size_mb.unwrap_or(0); + if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 { + metadata = Some(remote_storage::StorageMetadata::from([( + "databricks_azure_put_block", + local_path.as_str(), + )])); + } + } + GenericRemoteStorage::LocalFs(_) => {} + GenericRemoteStorage::AwsS3(_) => {} + GenericRemoteStorage::Unreliable(_) => {} + }; + /* END_HADRON */ let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, remote_path, None, cancel) + .upload(reader, fs_size, remote_path, metadata, cancel) .await .with_context(|| format!("upload layer from local path '{local_path}'")) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 954dd38bb4..bcece5589a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -34,6 +34,21 @@ use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { let total_threads = TOKIO_WORKER_THREADS.get(); + + /*BEGIN_HADRON*/ + // ideally we should run at least one compaction task per tenant in order to (1) maximize + // compaction throughput (2) avoid head-of-line blocking of large compactions. However doing + // that may create too many compaction tasks with lots of memory overheads. So we limit the + // number of compaction tasks based on the available CPU core count. + // Need to revisit. + // let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD") + // .ok() + // .and_then(|s| s.parse().ok()) + // .unwrap_or(4); + // let permits = usize::max(1, total_threads * tasks_per_thread); + // // assert!(permits < total_threads, "need threads for other work"); + /*END_HADRON*/ + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); assert_ne!(permits, 0, "we will not be adding in permits later"); assert!(permits < total_threads, "need threads for other work"); @@ -303,9 +318,6 @@ pub(crate) fn log_compaction_error( let level = match err { e if e.is_cancel() => return, ShuttingDown => return, - Offload(_) => Level::ERROR, - AlreadyRunning(_) => Level::ERROR, - CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); @@ -315,7 +327,7 @@ pub(crate) fn log_compaction_error( .is_some_and(|e| e.is_stopping()); let timeline = root_cause .downcast_ref::() - .is_some_and(|e| e.is_stopping()); + .is_some_and(|e| e.is_cancel()); let buffered_writer_flush_task_canelled = root_cause .downcast_ref::() .is_some_and(|e| e.is_cancel()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6b1e747cb6..4320f3b142 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -40,7 +40,6 @@ use layer_manager::{ Shutdown, }; -use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use pageserver_api::key::{ @@ -78,7 +77,7 @@ use utils::rate_limit::RateLimit; use utils::seqwait::SeqWait; use utils::simple_rcu::{Rcu, RcuReadGuard}; use utils::sync::gate::{Gate, GateGuard}; -use utils::{completion, critical, fs_ext, pausable_failpoint}; +use utils::{completion, critical_timeline, fs_ext, pausable_failpoint}; #[cfg(test)] use wal_decoder::models::value::Value; use 
wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; @@ -106,7 +105,7 @@ use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; -use crate::feature_resolver::FeatureResolver; +use crate::feature_resolver::TenantFeatureResolver; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::metrics::{ @@ -119,7 +118,6 @@ use crate::pgdatadir_mapping::{ MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; use crate::task_mgr::TaskKind; -use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; use crate::tenant::layer_map::LayerMap; use crate::tenant::metadata::TimelineMetadata; @@ -202,7 +200,7 @@ pub struct TimelineResources { pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, pub basebackup_cache: Arc, - pub feature_resolver: FeatureResolver, + pub feature_resolver: Arc, } pub struct Timeline { @@ -448,7 +446,7 @@ pub struct Timeline { /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, - feature_resolver: FeatureResolver, + feature_resolver: Arc, } pub(crate) enum PreviousHeatmap { @@ -585,6 +583,28 @@ pub(crate) enum PageReconstructError { MissingKey(Box), } +impl PageReconstructError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + PageReconstructError::Other(_) => false, + PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(), + PageReconstructError::Cancelled => true, + PageReconstructError::WalRedo(_) => false, + PageReconstructError::MissingKey(_) => false, + } + } + #[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + PageReconstructError::Other(e) => e, + PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(), + PageReconstructError::Cancelled => anyhow::Error::new(self), + PageReconstructError::WalRedo(e) => e, + PageReconstructError::MissingKey(_) => anyhow::Error::new(self), + } + } +} + impl From for PageReconstructError { fn from(value: anyhow::Error) -> Self { // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error @@ -738,17 +758,6 @@ impl std::fmt::Display for MissingKeyError { } } -impl PageReconstructError { - /// Returns true if this error indicates a tenant/timeline shutdown alike situation - pub(crate) fn is_stopping(&self) -> bool { - use PageReconstructError::*; - match self { - Cancelled => true, - Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false, - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] @@ -951,13 +960,35 @@ pub enum WaitLsnError { Timeout(String), } +impl WaitLsnError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + WaitLsnError::Shutdown => true, + WaitLsnError::BadState(timeline_state) => match timeline_state { + TimelineState::Loading => false, + TimelineState::Active => false, + TimelineState::Stopping => true, + TimelineState::Broken { .. 
} => false, + }, + WaitLsnError::Timeout(_) => false, + } + } + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + WaitLsnError::Shutdown => anyhow::Error::new(self), + WaitLsnError::BadState(_) => anyhow::Error::new(self), + WaitLsnError::Timeout(_) => anyhow::Error::new(self), + } + } +} + impl From for tonic::Status { fn from(err: WaitLsnError) -> Self { use tonic::Code; - let code = match &err { - WaitLsnError::Timeout(_) => Code::Internal, - WaitLsnError::BadState(_) => Code::Internal, - WaitLsnError::Shutdown => Code::Unavailable, + let code = if err.is_cancel() { + Code::Unavailable + } else { + Code::Internal }; tonic::Status::new(code, err.to_string()) } @@ -1084,6 +1115,26 @@ enum ImageLayerCreationOutcome { Skip, } +enum RepartitionError { + Other(anyhow::Error), + CollectKeyspace(CollectKeySpaceError), +} + +impl RepartitionError { + fn is_cancel(&self) -> bool { + match self { + RepartitionError::Other(_) => false, + RepartitionError::CollectKeyspace(e) => e.is_cancel(), + } + } + fn into_anyhow(self) -> anyhow::Error { + match self { + RepartitionError::Other(e) => e, + RepartitionError::CollectKeyspace(e) => e.into_anyhow(), + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -1770,30 +1821,31 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN if we are in AttachedSingle and - // not blocked by the lsn lease deadline. + // Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted. + let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); + } + + // We allow create lease for those below the planned gc cutoff if we are still within the grace period + // of GC blocking. let validate = { let conf = self.tenant_conf.load(); - conf.location.attach_mode == AttachmentMode::Single - && !conf.is_gc_blocked_by_lsn_lease_deadline() + !conf.is_gc_blocked_by_lsn_lease_deadline() }; - if init || validate { - let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); - if lsn < *latest_gc_cutoff_lsn { - bail!( - "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", - lsn, - *latest_gc_cutoff_lsn - ); - } - if lsn < planned_cutoff { - bail!( - "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", - lsn, - planned_cutoff - ); - } + // Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines + // whether it is a initial lease creation or a renewal. + if (init || validate) && lsn < planned_cutoff { + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } let dt: DateTime = valid_until.into(); @@ -2066,19 +2118,9 @@ impl Timeline { Err(CompactionError::ShuttingDown) => { // Covered by the `Err(e) if e.is_cancel()` branch. } - Err(CompactionError::AlreadyRunning(_)) => { - // Covered by the `Err(e) if e.is_cancel()` branch. - } Err(CompactionError::Other(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } - Err(CompactionError::CollectKeySpaceError(_)) => { - // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. 
- self.compaction_failed.store(true, AtomicOrdering::Relaxed) - } - // Don't change the current value on offload failure or shutdown. We don't want to - // abruptly stall nor resume L0 flushes in these cases. - Err(CompactionError::Offload(_)) => {} }; result @@ -2142,14 +2184,31 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. + // cancel walreceiver to stop ingesting more data asap. + // + // Note that we're accepting a race condition here where we may + // do the final flush below, before walreceiver observes the + // cancellation and exits. + // This means we may open a new InMemoryLayer after the final flush below. + // Flush loop is also still running for a short while, so, in theory, it + // could also make its way into the upload queue. + // + // If we wait for the shutdown of the walreceiver before moving on to the + // flush, then that would be avoided. But we don't do it because the + // walreceiver entertains reads internally, which means that it possibly + // depends on the download of layers. Layer download is only sensitive to + // the cancellation of the entire timeline, so cancelling the walreceiver + // will have no effect on the individual get requests. + // This would cause problems when there is a lot of ongoing downloads or + // there is S3 unavailabilities, i.e. detach, deletion, etc would hang, + // and we can't deallocate resources of the timeline, etc. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.shutdown().await; + walreceiver.cancel().await; } // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); @@ -3108,7 +3167,7 @@ impl Timeline { basebackup_cache: resources.basebackup_cache, - feature_resolver: resources.feature_resolver, + feature_resolver: resources.feature_resolver.clone(), }; result.repartition_threshold = @@ -4725,7 +4784,7 @@ impl Timeline { } // Fetch the next layer to flush, if any. 
- let (layer, l0_count, frozen_count, frozen_size) = { + let (layer, l0_count, frozen_count, frozen_size, open_layer_size) = { let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await; let Ok(lm) = layers.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); @@ -4738,8 +4797,13 @@ impl Timeline { .iter() .map(|l| l.estimated_in_mem_size()) .sum(); + let open_layer_size: u64 = lm + .open_layer + .as_ref() + .map(|l| l.estimated_in_mem_size()) + .unwrap_or(0); let layer = lm.frozen_layers.front().cloned(); - (layer, l0_count, frozen_count, frozen_size) + (layer, l0_count, frozen_count, frozen_size, open_layer_size) // drop 'layers' lock }; let Some(layer) = layer else { @@ -4752,7 +4816,7 @@ impl Timeline { if l0_count >= stall_threshold { warn!( "stalling layer flushes for compaction backpressure at {l0_count} \ - L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)" ); let stall_timer = self .metrics @@ -4805,7 +4869,7 @@ impl Timeline { let delay = flush_duration.as_secs_f64(); info!( "delaying layer flush by {delay:.3}s for compaction backpressure at \ - {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)" + {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes, {open_layer_size} bytes in open layer)" ); let _delay_timer = self .metrics @@ -4944,7 +5008,7 @@ impl Timeline { ctx, ) .await - .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?; + .map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -5194,18 +5258,18 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { + ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> { let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - return Err(CompactionError::Other(anyhow!( + return Err(RepartitionError::Other(anyhow!( "repartition() called concurrently" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { - return Err(CompactionError::Other(anyhow!( + return Err(RepartitionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" ))); } @@ -5226,7 +5290,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(RepartitionError::CollectKeyspace)?; let dense_partitioning = dense_ks.partition( &self.shard_identity, partition_size, @@ -5304,6 +5371,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range, io_concurrency: IoConcurrency, + progress: Option<(usize, usize)>, ) -> Result { let mut wrote_keys = false; @@ -5380,11 +5448,15 @@ impl Timeline { } } + let progress_report = progress + .map(|(idx, total)| format!("({idx}/{total}) ")) + .unwrap_or_default(); if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. 
info!( - "produced image layer for rel {}", + "{} produced image layer for rel {}", + progress_report, ImageLayerName { key_range: img_range.clone(), lsn @@ -5394,7 +5466,12 @@ impl Timeline { unfinished_image_layer: image_layer_writer, }) } else { - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + tracing::debug!( + "{} no data in range {}-{}", + progress_report, + img_range.start, + img_range.end + ); Ok(ImageLayerCreationOutcome::Empty) } } @@ -5629,7 +5706,8 @@ impl Timeline { } } - for partition in partition_parts.iter() { + let total = partition_parts.len(); + for (idx, partition) in partition_parts.iter().enumerate() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } @@ -5714,6 +5792,7 @@ impl Timeline { ctx, img_range.clone(), io_concurrency, + Some((idx, total)), ) .await? } else { @@ -5979,52 +6058,21 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, - /// Compaction tried to offload a timeline and failed - #[error("Failed to offload timeline: {0}")] - Offload(OffloadError), - /// Compaction cannot be done right now; page reconstruction and so on. - #[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), - #[error("Compaction already running: {0}")] - AlreadyRunning(&'static str), } impl CompactionError { /// Errors that can be ignored, i.e., cancel and shutdown. pub fn is_cancel(&self) -> bool { - matches!( - self, + matches!(self, Self::ShuttingDown) + } + + pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { + if err.is_cancel() { Self::ShuttingDown - | Self::AlreadyRunning(_) - | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) - | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( - PageReconstructError::Cancelled - )) - | Self::Offload(OffloadError::Cancelled) - ) - } - - /// Critical errors that indicate data corruption. - pub fn is_critical(&self) -> bool { - matches!( - self, - Self::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead( - PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), - ) - ) - ) - } -} - -impl From for CompactionError { - fn from(e: OffloadError) -> Self { - match e { - OffloadError::Cancelled => Self::ShuttingDown, - _ => Self::Offload(e), + } else { + Self::Other(err.into_anyhow()) } } } @@ -6702,7 +6750,7 @@ impl Timeline { } /// Reconstruct a value, using the given base image and WAL records in 'data'. 
- async fn reconstruct_value( + pub(crate) async fn reconstruct_value( &self, key: Key, request_lsn: Lsn, @@ -6778,7 +6826,11 @@ impl Timeline { Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), Err(walredo::Error::Other(err)) => { if fire_critical_error { - critical!("walredo failure during page reconstruction: {err:?}"); + critical_timeline!( + self.tenant_shard_id, + self.timeline_id, + "walredo failure during page reconstruction: {err:?}" + ); } return Err(PageReconstructError::WalRedo( err.context("reconstruct a page image"), diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a7bd89a326..e5ce733663 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,14 +9,15 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use std::time::{Duration, Instant}; -use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard}; +use super::layer_manager::LayerManagerLockHolder; use super::{ CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; -use crate::tenant::timeline::DeltaEntry; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::tenant::timeline::{DeltaEntry, RepartitionError}; use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; @@ -36,7 +37,7 @@ use serde::Serialize; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, info_span, trace, warn}; -use utils::critical; +use utils::critical_timeline; use utils::id::TimelineId; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; @@ -64,7 +65,7 @@ use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_layer_manager_rlock, }; -use crate::tenant::{DeltaLayer, MaybeOffloaded}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. @@ -101,7 +102,11 @@ pub enum GcCompactionQueueItem { /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) auto: bool, }, - SubCompactionJob(CompactOptions), + SubCompactionJob { + i: usize, + total: usize, + options: CompactOptions, + }, Notify(GcCompactionJobId, Option), } @@ -163,7 +168,7 @@ impl GcCompactionQueueItem { running, job_id: id.0, }), - GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + GcCompactionQueueItem::SubCompactionJob { options, .. } => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, @@ -489,7 +494,7 @@ impl GcCompactionQueue { .map(|job| job.compact_lsn_range.end) .max() .unwrap(); - for job in jobs { + for (i, job) in jobs.into_iter().enumerate() { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. 
let mut flags: EnumSet = EnumSet::default(); @@ -507,7 +512,11 @@ impl GcCompactionQueue { compact_lsn_range: Some(job.compact_lsn_range.into()), sub_compaction_max_job_size_mb: None, }; - pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob { + options, + i, + total: jobs_len, + }); } if !auto { @@ -564,7 +573,7 @@ impl GcCompactionQueue { match res { Ok(res) => Ok(res), Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(_) => { + Err(CompactionError::Other(_)) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -583,9 +592,9 @@ impl GcCompactionQueue { timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { - return Err(CompactionError::AlreadyRunning( - "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", - )); + return Err(CompactionError::Other(anyhow::anyhow!( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue." + ))); }; let has_pending_tasks; let mut yield_for_l0 = false; @@ -651,7 +660,7 @@ impl GcCompactionQueue { } } } - GcCompactionQueueItem::SubCompactionJob(options) => { + GcCompactionQueueItem::SubCompactionJob { options, i, total } => { // TODO: error handling, clear the queue if any task fails? let _gc_guard = match gc_block.start().await { Ok(guard) => guard, @@ -663,6 +672,7 @@ impl GcCompactionQueue { ))); } }; + info!("running gc-compaction subcompaction job {}/{}", i, total); let res = timeline.compact_with_options(cancel, options, ctx).await; let compaction_result = match res { Ok(res) => res, @@ -1310,7 +1320,7 @@ impl Timeline { || cfg!(feature = "testing") || self .feature_resolver - .evaluate_boolean("image-compaction-boundary", self.tenant_shard_id.tenant_id) + .evaluate_boolean("image-compaction-boundary") .is_ok() { let last_repartition_lsn = self.partitioning.read().1; @@ -1381,7 +1391,11 @@ impl Timeline { GetVectoredError::MissingKey(_), ) = err { - critical!("missing key during compaction: {err:?}"); + critical_timeline!( + self.tenant_shard_id, + self.timeline_id, + "missing key during compaction: {err:?}" + ); } })?; @@ -1404,18 +1418,33 @@ impl Timeline { } // Suppress errors when cancelled. - Err(_) if self.cancel.is_cancelled() => {} - Err(err) if err.is_cancel() => {} - - // Alert on critical errors that indicate data corruption. - Err(err) if err.is_critical() => { - critical!("could not compact, repartitioning keyspace failed: {err:?}"); - } - - // Log other errors. No partitioning? This is normal, if the timeline was just created + // + // Log other errors but continue. Failure to repartition is normal, if the timeline was just created // as an empty timeline. Also in unit tests, when we use the timeline as a simple // key-value store, ignoring the datadir layout. Log the error but continue. - Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), + // + // TODO: + // 1. shouldn't we return early here if we observe cancellation + // 2. Experiment: can we stop checking self.cancel here? 
+ Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch + Err(err) if err.is_cancel() => {} + Err(RepartitionError::CollectKeyspace( + e @ CollectKeySpaceError::Decode(_) + | e @ CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + )) => { + // Alert on critical errors that indicate data corruption. + critical_timeline!( + self.tenant_shard_id, + self.timeline_id, + "could not compact, repartitioning keyspace failed: {e:?}" + ); + } + Err(e) => error!( + "could not compact, repartitioning keyspace failed: {:?}", + e.into_anyhow() + ), }; let partition_count = self.partitioning.read().0.0.parts.len(); @@ -1591,13 +1620,15 @@ impl Timeline { let started = Instant::now(); let mut replace_image_layers = Vec::new(); + let total = layers_to_rewrite.len(); - for layer in layers_to_rewrite { + for (i, layer) in layers_to_rewrite.into_iter().enumerate() { if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } - info!(layer=%layer, "rewriting layer after shard split"); + info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total); + let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -1779,20 +1810,14 @@ impl Timeline { } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { + let stats = CompactLevel0Phase1StatsBuilder { version: Some(2), tenant_id: Some(self.tenant_shard_id), timeline_id: Some(self.timeline_id), ..Default::default() }; - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); self.compact_level0_phase1( - phase1_layers_locked, stats, target_file_size, force_compaction_ignore_threshold, @@ -1813,16 +1838,19 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1<'a>( - self: &'a Arc, - guard: LayerManagerReadGuard<'a>, + async fn compact_level0_phase1( + self: &Arc, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller + let begin = tokio::time::Instant::now(); + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + let layers = guard.layer_map()?; let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); @@ -1857,6 +1885,12 @@ impl Timeline { .map(|x| guard.get_from_desc(x)) .collect::>(); + drop_layer_manager_rlock(guard); + + // The is the last LSN that we have seen for L0 compaction in the timeline. This LSN might be updated + // by the time we finish the compaction. So we need to get it here. + let l0_last_record_lsn = self.get_last_record_lsn(); + // Gather the files to compact in this iteration. // // Start with the oldest Level 0 delta file, and collect any other @@ -1944,9 +1978,7 @@ impl Timeline { // we don't accidentally use it later in the function. 
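// --- editor's sketch (illustrative, not part of this diff) ---
// Net effect of the compact_level0_phase1 changes around here: the layer-manager read
// lock is no longer handed in by the caller and held across the whole phase. The
// function now takes the lock itself, collects the resident L0 layers, releases it
// before the expensive sort/merge work, and only re-acquires it briefly per candidate
// hole to compute image coverage. Condensed shape; the function is hypothetical, the
// names are from this PR, and the key range is a placeholder.
async fn phase1_locking_shape(tline: &Arc<Timeline>) -> Result<(), CompactionError> {
    let begin = tokio::time::Instant::now();
    let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await;
    // ... record lock acquisition time, select level-0 deltas, upgrade them to resident layers ...
    drop_layer_manager_rlock(guard); // release before the long-running key sort
    let l0_last_record_lsn = tline.get_last_record_lsn(); // snapshot once, reused for hole detection
    // later, per candidate hole:
    let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await;
    let layers = guard.layer_map()?;
    let _coverage = layers
        .image_coverage(&(Key::MIN..Key::MAX), l0_last_record_lsn) // placeholder range
        .len();
    drop(guard);
    let _ = begin;
    Ok(())
}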
drop(level0_deltas); - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); + stats.compaction_prerequisites_micros = stats.read_lock_acquisition_micros.till_now(); // TODO: replace with streaming k-merge let all_keys = { @@ -1968,7 +2000,7 @@ impl Timeline { all_keys }; - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + stats.read_lock_held_key_sort_micros = stats.compaction_prerequisites_micros.till_now(); // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. // @@ -2002,7 +2034,6 @@ impl Timeline { } } let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? // min-heap (reserve space for one more element added before eviction) @@ -2021,8 +2052,12 @@ impl Timeline { // has not so much sense, because largest holes will corresponds field1/field2 changes. // But we are mostly interested to eliminate holes which cause generation of excessive image layers. // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = - layers.image_coverage(&key_range, last_record_lsn).len(); + let coverage_size = { + // TODO: optimize this with copy-on-write layer map. + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; + let layers = guard.layer_map()?; + layers.image_coverage(&key_range, l0_last_record_lsn).len() + }; if coverage_size >= min_hole_coverage_size { heap.push(Hole { key_range, @@ -2041,7 +2076,6 @@ impl Timeline { holes }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_layer_manager_rlock(guard); if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); @@ -2382,9 +2416,8 @@ struct CompactLevel0Phase1StatsBuilder { tenant_id: Option, timeline_id: Option, read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, + compaction_prerequisites_micros: DurationRecorder, read_lock_held_compute_holes_micros: DurationRecorder, read_lock_drop_micros: DurationRecorder, write_layer_files_micros: DurationRecorder, @@ -2399,9 +2432,8 @@ struct CompactLevel0Phase1Stats { tenant_id: TenantShardId, timeline_id: TimelineId, read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, + compaction_prerequisites_micros: RecordedDuration, read_lock_held_compute_holes_micros: RecordedDuration, read_lock_drop_micros: RecordedDuration, write_layer_files_micros: RecordedDuration, @@ -2426,16 +2458,12 @@ impl TryFrom for CompactLevel0Phase1Stats { .read_lock_acquisition_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, read_lock_held_key_sort_micros: value .read_lock_held_key_sort_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - 
read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros + compaction_prerequisites_micros: value + .compaction_prerequisites_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, read_lock_held_compute_holes_micros: value @@ -2502,7 +2530,10 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + let (dense_ks, _sparse_ks) = self + .collect_keyspace(end_lsn, ctx) + .await + .map_err(CompactionError::from_collect_keyspace)?; // TODO(chi): ignore sparse_keyspace for now, compact it in the future. let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); @@ -4343,6 +4374,7 @@ impl TimelineAdaptor { ctx, key_range.clone(), IoConcurrency::sequential(), + None, ) .await?; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index f20a1343df..223e888e27 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -182,6 +182,7 @@ pub(crate) async fn generate_tombstone_image_layer( detached: &Arc, ancestor: &Arc, ancestor_lsn: Lsn, + historic_layers_to_copy: &Vec, ctx: &RequestContext, ) -> Result, Error> { tracing::info!( @@ -199,6 +200,20 @@ pub(crate) async fn generate_tombstone_image_layer( let image_lsn = ancestor_lsn; { + for layer in historic_layers_to_copy { + let desc = layer.layer_desc(); + if !desc.is_delta + && desc.lsn_range.start == image_lsn + && overlaps_with(&key_range, &desc.key_range) + { + tracing::info!( + layer=%layer, "will copy tombstone from ancestor instead of creating a new one" + ); + + return Ok(None); + } + } + let layers = detached .layers .read(LayerManagerLockHolder::DetachAncestor) @@ -450,7 +465,8 @@ pub(super) async fn prepare( Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1); if let Some(tombstone_layer) = - generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await? + generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, &rest_of_historic, ctx) + .await? { new_layers.push(tombstone_layer.into()); } diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 2dbff20ab2..33c97287c0 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -212,8 +212,12 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). -use std::collections::{HashMap, hash_map}; -use std::sync::{Arc, Mutex, Weak}; +use std::collections::HashMap; +use std::collections::hash_map; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; +use std::time::Duration; use pageserver_api::shard::ShardIdentity; use tracing::{instrument, trace}; @@ -333,6 +337,44 @@ enum RoutingResult { } impl Cache { + /* BEGIN_HADRON */ + /// A wrapper of do_get to resolve the tenant shard for a get page request. 
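// --- editor's sketch (illustrative, not part of this diff) ---
// The wrapper added below retries `do_get` (the renamed inner lookup) up to 10 times,
// sleeping 100 ms between attempts, so page_service requests ride out the transient
// window during a tenant shard split where shard resolution fails. The same
// retry/fixed-backoff shape in isolation, with invented names (needs only tokio and
// tracing, no pageserver types):
async fn retry_with_fixed_backoff<T, E, F, Fut>(
    max_attempts: usize,
    backoff: std::time::Duration,
    mut op: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    E: std::fmt::Debug,
{
    let mut attempt = 0;
    loop {
        attempt += 1;
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempt < max_attempts => {
                // transient failure: log and try again after a short pause
                tracing::warn!("attempt {attempt} failed: {e:?}; retrying");
                tokio::time::sleep(backoff).await;
            }
            Err(e) => return Err(e),
        }
    }
}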
+ #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + const GET_MAX_RETRIES: usize = 10; + const RETRY_BACKOFF: Duration = Duration::from_millis(100); + let mut attempt = 0; + loop { + attempt += 1; + match self + .do_get(timeline_id, shard_selector, tenant_manager) + .await + { + Ok(handle) => return Ok(handle), + Err(e) => { + // Retry on tenant manager error to handle tenant split more gracefully + if attempt < GET_MAX_RETRIES { + tracing::warn!( + "Fail to resolve tenant shard in attempt {}: {:?}. Retrying...", + attempt, + e + ); + tokio::time::sleep(RETRY_BACKOFF).await; + continue; + } else { + return Err(e); + } + } + } + } + } + /* END_HADRON */ + /// See module-level comment for details. /// /// Does NOT check for the shutdown state of [`Types::Timeline`]. @@ -341,7 +383,7 @@ impl Cache { /// and if so, return an error that causes the page service to /// close the connection. #[instrument(level = "trace", skip_all)] - pub(crate) async fn get( + async fn do_get( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, @@ -879,6 +921,7 @@ mod tests { .await .err() .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); cache diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 5920315917..e9cf2e9aa7 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -17,8 +17,8 @@ pub(crate) enum OffloadError { Cancelled, #[error("Timeline is not archived")] NotArchived, - #[error(transparent)] - RemoteStorage(anyhow::Error), + #[error("Offload or deletion already in progress")] + AlreadyInProgress, #[error("Unexpected offload error: {0}")] Other(anyhow::Error), } @@ -27,7 +27,7 @@ impl From for OffloadError { fn from(e: TenantManifestError) -> Self { match e { TenantManifestError::Cancelled => Self::Cancelled, - TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e), + TenantManifestError::RemoteStorage(e) => Self::Other(e), } } } @@ -44,20 +44,26 @@ pub(crate) async fn offload_timeline( timeline.timeline_id, TimelineDeleteGuardKind::Offload, ); - if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { - let is_archived = timeline.is_archived(); - if is_archived == Some(true) { - tracing::error!("timeline is archived but has non-archived children: {children:?}"); + let (timeline, guard) = match delete_guard_res { + Ok(timeline_and_guard) => timeline_and_guard, + Err(DeleteTimelineError::HasChildren(children)) => { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); return Err(OffloadError::NotArchived); } - tracing::info!( - ?is_archived, - "timeline is not archived and has unarchived children" - ); - return Err(OffloadError::NotArchived); + Err(DeleteTimelineError::AlreadyInProgress(_)) => { + tracing::info!("timeline offload or deletion already in progress"); + return Err(OffloadError::AlreadyInProgress); + } + Err(e) => return Err(OffloadError::Other(anyhow::anyhow!(e))), }; - let (timeline, guard) = - delete_guard_res.map_err(|e| 
OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 633c94a010..3fe6c21a7d 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,7 +63,6 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. cancel: CancellationToken, - task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -80,7 +79,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - let task = WALRECEIVER_RUNTIME.spawn({ + let _task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -121,25 +120,14 @@ impl WalReceiver { Self { manager_status, cancel, - task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub async fn shutdown(self) { + pub async fn cancel(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); - match self.task.await { - Ok(()) => debug!("Shutdown success"), - Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // already logged by panic hook - } - Err(je) => { - error!("shutdown walreceiver task join error: {je}") - } - } } pub(crate) fn status(&self) -> Option { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 9b3cfb200a..fe669d2d74 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -100,6 +100,7 @@ pub(super) async fn connection_manager_loop_step( // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; + let mut broker_reset_interval = tokio::time::interval(tokio::time::Duration::from_secs(30)); debug!("Subscribed for broker timeline updates"); loop { @@ -156,7 +157,10 @@ pub(super) async fn connection_manager_loop_step( // Got a new update from the broker broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => { match broker_update { - Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), + Ok(Some(broker_update)) => { + broker_reset_interval.reset(); + connection_manager_state.register_timeline_update(broker_update); + }, Err(status) => { match status.code() { Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => { @@ -178,6 +182,21 @@ pub(super) async fn connection_manager_loop_step( } }, + // If we've not received any updates from the broker from a while, are waiting for WAL + // and have no safekeeper connection or connection candidates, then it might be that + // the broker subscription is wedged. 
Drop the current subscription and re-subscribe + with the goal of unblocking it. + _ = broker_reset_interval.tick() => { + let awaiting_lsn = wait_lsn_status.borrow().is_some(); + let no_candidates = connection_manager_state.wal_stream_candidates.is_empty(); + let no_connection = connection_manager_state.wal_connection.is_none(); + + if awaiting_lsn && no_candidates && no_connection { + tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ..."); + broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; + } + }, + new_event = async { // Reminder: this match arm needs to be cancellation-safe. loop { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 6d52da1f00..f619c69599 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -25,7 +25,7 @@ use tokio_postgres::replication::ReplicationStream; use tokio_postgres::{Client, SimpleQueryMessage, SimpleQueryRow}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, trace, warn}; -use utils::critical; +use utils::critical_timeline; use utils::id::NodeId; use utils::lsn::Lsn; use utils::pageserver_feedback::PageserverFeedback; @@ -275,20 +275,12 @@ pub(super) async fn handle_walreceiver_connection( let copy_stream = replication_client.copy_both_simple(&query).await?; let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); - let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx); - let walingest_res = select! { - walingest_res = walingest_future => walingest_res, - _ = cancellation.cancelled() => { - // We are doing reads in WalIngest::new, and those can hang as they come from the network. - // Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
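// --- editor's sketch (illustrative, not part of this diff) ---
// The connection_manager change above adds a 30-second `tokio::time::interval` that is
// `reset()` on every broker update; if it fires while the timeline is waiting for WAL
// with no safekeeper connection and no candidates, the broker subscription is assumed
// wedged and is re-created. The watchdog shape in isolation, with invented names
// (needs only tokio and tracing):
async fn broker_watchdog_sketch(mut events: tokio::sync::mpsc::Receiver<u64>) {
    // note: a tokio interval completes its first tick immediately
    let mut idle = tokio::time::interval(std::time::Duration::from_secs(30));
    loop {
        tokio::select! {
            maybe_update = events.recv() => {
                match maybe_update {
                    Some(update) => {
                        idle.reset(); // activity observed: push the deadline out
                        tracing::info!("got update {update}");
                    }
                    None => break, // sender dropped, stop watching
                }
            }
            _ = idle.tick() => {
                // a full period with no activity: re-create the subscription here
                tracing::warn!("no updates received for a while, re-subscribing");
            }
        }
    }
}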
- debug!("Connection cancelled"); - return Err(WalReceiverError::Cancelled); - }, - }; - let mut walingest = walingest_res.map_err(|e| match e.kind { - crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, - _ => WalReceiverError::Other(e.into()), - })?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) + .await + .map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let (format, compression) = match protocol { PostgresClientProtocol::Interpreted { @@ -368,9 +360,13 @@ pub(super) async fn handle_walreceiver_connection( match raw_wal_start_lsn.cmp(&expected_wal_start) { std::cmp::Ordering::Greater => { let msg = format!( - "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})" + "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn}" + ); + critical_timeline!( + timeline.tenant_shard_id, + timeline.timeline_id, + "{msg}" ); - critical!("{msg}"); return Err(WalReceiverError::Other(anyhow!(msg))); } std::cmp::Ordering::Less => { @@ -383,7 +379,11 @@ pub(super) async fn handle_walreceiver_connection( "Received record with next_record_lsn multiple times ({} < {})", first_rec.next_record_lsn, expected_wal_start ); - critical!("{msg}"); + critical_timeline!( + timeline.tenant_shard_id, + timeline.timeline_id, + "{msg}" + ); return Err(WalReceiverError::Other(anyhow!(msg))); } } @@ -452,7 +452,11 @@ pub(super) async fn handle_walreceiver_connection( // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. if !cancellation.is_cancelled() && !timeline.is_stopping() { - critical!("{err:?}") + critical_timeline!( + timeline.tenant_shard_id, + timeline.timeline_id, + "{err:?}" + ); } })?; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 29d1a31aaf..ccfad7a391 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -45,9 +45,10 @@ pub(crate) fn regenerate( let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?; // Fetch the fraction of disk space which may be used - let disk_usable_pct = match conf.disk_usage_based_eviction.clone() { - Some(e) => e.max_usage_pct, - None => Percent::new(100).unwrap(), + let disk_usable_pct = if conf.disk_usage_based_eviction.enabled { + conf.disk_usage_based_eviction.max_usage_pct + } else { + Percent::new(100).unwrap() }; // Express a static value for how many shards we may schedule on one node diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index a597aedee3..f852051178 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -40,7 +40,7 @@ use tracing::*; use utils::bin_ser::{DeserializeError, SerializeError}; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; -use utils::{critical, failpoint_support}; +use utils::{critical_timeline, failpoint_support}; use wal_decoder::models::record::NeonWalRecord; use wal_decoder::models::*; @@ -418,18 +418,30 @@ impl WalIngest { // as there has historically been cases where PostgreSQL has cleared spurious VM pages. See: // https://github.com/neondatabase/neon/pull/10634. let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? 
else { - critical!("clear_vm_bits for unknown VM relation {vm_rel}"); + critical_timeline!( + modification.tline.tenant_shard_id, + modification.tline.timeline_id, + "clear_vm_bits for unknown VM relation {vm_rel}" + ); return Ok(()); }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { - critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); + critical_timeline!( + modification.tline.tenant_shard_id, + modification.tline.timeline_id, + "new_vm_blk {blknum} not in {vm_rel} of size {vm_size}" + ); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { - critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"); + critical_timeline!( + modification.tline.tenant_shard_id, + modification.tline.timeline_id, + "old_vm_blk {blknum} not in {vm_rel} of size {vm_size}" + ); old_vm_blk = None; } } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b17b5a15f9..c6d3cafe9a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -566,22 +566,55 @@ impl PostgresRedoManager { } } +#[cfg(test)] +pub(crate) mod harness { + use super::PostgresRedoManager; + use crate::config::PageServerConf; + use utils::{id::TenantId, shard::TenantShardId}; + + pub struct RedoHarness { + // underscored because unused, except for removal at drop + _repo_dir: camino_tempfile::Utf8TempDir, + pub manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, + } + + impl RedoHarness { + pub fn new() -> anyhow::Result { + crate::tenant::harness::setup_logging(); + + let repo_dir = camino_tempfile::tempdir()?; + let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let conf = Box::leak(Box::new(conf)); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); + + Ok(RedoHarness { + _repo_dir: repo_dir, + manager, + tenant_shard_id, + }) + } + pub fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + } +} + #[cfg(test)] mod tests { use std::str::FromStr; use bytes::Bytes; use pageserver_api::key::Key; - use pageserver_api::shard::TenantShardId; use postgres_ffi::PgMajorVersion; use tracing::Instrument; - use utils::id::TenantId; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; - use super::PostgresRedoManager; - use crate::config::PageServerConf; use crate::walredo::RedoAttemptType; + use crate::walredo::harness::RedoHarness; #[tokio::test] async fn test_ping() { @@ -692,33 +725,4 @@ mod tests { ) ] } - - struct RedoHarness { - // underscored because unused, except for removal at drop - _repo_dir: camino_tempfile::Utf8TempDir, - manager: PostgresRedoManager, - tenant_shard_id: TenantShardId, - } - - impl RedoHarness { - fn new() -> anyhow::Result { - crate::tenant::harness::setup_logging(); - - let repo_dir = camino_tempfile::tempdir()?; - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); - let conf = Box::leak(Box::new(conf)); - let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - - let manager = PostgresRedoManager::new(conf, tenant_shard_id); - - Ok(RedoHarness { - _repo_dir: repo_dir, - manager, - tenant_shard_id, - }) - } - fn span(&self) -> tracing::Span { - tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) - } - } } diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 
9bce0e798a..bf7aeb4108 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -22,7 +22,8 @@ OBJS = \ walproposer.o \ walproposer_pg.o \ neon_ddl_handler.o \ - walsender_hooks.o + walsender_hooks.o \ + $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) @@ -54,6 +55,17 @@ WALPROP_OBJS = \ neon_utils.o \ walproposer_compat.o +# libcommunicator.a is built by cargo from the Rust sources under communicator/ +# subdirectory. `cargo build` also generates communicator_bindings.h. +neon.o: communicator/communicator_bindings.h + +$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &: + (cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)) + +# Force `cargo build` every time. Some of the Rust sources might have +# changed. +.PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h + .PHONY: walproposer-lib walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB walproposer-lib: libwalproposer.a; diff --git a/pgxn/neon/communicator/.gitignore b/pgxn/neon/communicator/.gitignore new file mode 100644 index 0000000000..d713be0a35 --- /dev/null +++ b/pgxn/neon/communicator/.gitignore @@ -0,0 +1,2 @@ +# generated file (with cbindgen, see build.rs) +communicator_bindings.h diff --git a/pgxn/neon/communicator/Cargo.toml b/pgxn/neon/communicator/Cargo.toml new file mode 100644 index 0000000000..e95a269d90 --- /dev/null +++ b/pgxn/neon/communicator/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "communicator" +version = "0.1.0" +license.workspace = true +edition.workspace = true + +[lib] +crate-type = ["staticlib"] + +[features] +# 'testing' feature is currently unused in the communicator, but we accept it for convenience of +# calling build scripts, so that you can pass the same feature to all packages. +testing = [] + +[dependencies] +neon-shmem.workspace = true +workspace_hack = { version = "0.1", path = "../../../workspace_hack" } + +[build-dependencies] +cbindgen.workspace = true diff --git a/pgxn/neon/communicator/README.md b/pgxn/neon/communicator/README.md new file mode 100644 index 0000000000..8169ae72b5 --- /dev/null +++ b/pgxn/neon/communicator/README.md @@ -0,0 +1,8 @@ +This package will evolve into a "compute-pageserver communicator" +process and machinery. For now, it's just a dummy that doesn't do +anything interesting, but it allows us to test the compilation and +linking of Rust code into the Postgres extensions. + +At compilation time, pgxn/neon/communicator/ produces a static +library, libcommunicator.a. It is linked to the neon.so extension +library. diff --git a/pgxn/neon/communicator/build.rs b/pgxn/neon/communicator/build.rs new file mode 100644 index 0000000000..2b83b4238d --- /dev/null +++ b/pgxn/neon/communicator/build.rs @@ -0,0 +1,20 @@ +use std::env; + +fn main() -> Result<(), Box> { + let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap(); + + match cbindgen::generate(crate_dir) { + Ok(bindings) => { + bindings.write_to_file("communicator_bindings.h"); + } + Err(cbindgen::Error::ParseSyntaxError { .. }) => { + // This means there was a syntax error in the Rust sources. Don't panic, because + // we want the build to continue and the Rust compiler to hit the error. The + // Rust compiler produces a better error message than cbindgen. 
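// --- editor's note (illustrative, not part of this diff) ---
// For the dummy export in pgxn/neon/communicator/src/lib.rs,
//     #[unsafe(no_mangle)]
//     pub extern "C" fn communicator_dummy(arg: u32) -> u32 { arg + 1 }
// cbindgen (configured with language = "C" in cbindgen.toml) is expected to emit
// roughly the following declaration into communicator_bindings.h; the exact header
// contents are an assumption, not taken from this diff:
//     uint32_t communicator_dummy(uint32_t arg);
// That generated prototype is what lets neon.c include the header and call
// `communicator_dummy(123)` from `_PG_init()` further down in this diff.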
+ eprintln!("Generating C bindings failed because of a Rust syntax error"); + } + Err(err) => panic!("Unable to generate C bindings: {err:?}"), + }; + + Ok(()) +} diff --git a/pgxn/neon/communicator/cbindgen.toml b/pgxn/neon/communicator/cbindgen.toml new file mode 100644 index 0000000000..72e0c8174a --- /dev/null +++ b/pgxn/neon/communicator/cbindgen.toml @@ -0,0 +1,4 @@ +language = "C" + +[enum] +prefix_with_name = true diff --git a/pgxn/neon/communicator/src/lib.rs b/pgxn/neon/communicator/src/lib.rs new file mode 100644 index 0000000000..24c180d37d --- /dev/null +++ b/pgxn/neon/communicator/src/lib.rs @@ -0,0 +1,6 @@ +/// dummy function, just to test linking Rust functions into the C +/// extension +#[unsafe(no_mangle)] +pub extern "C" fn communicator_dummy(arg: u32) -> u32 { + arg + 1 +} diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 8a405f4129..9e0ca16fed 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -43,6 +43,9 @@ #include "storage/ipc.h" #endif +/* the rust bindings, generated by cbindgen */ +#include "communicator/communicator_bindings.h" + PG_MODULE_MAGIC; void _PG_init(void); @@ -87,6 +90,14 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = { {NULL, 0, false} }; +static const struct config_enum_entry debug_compare_local_modes[] = { + {"none", DEBUG_COMPARE_LOCAL_NONE, false}, + {"prefetch", DEBUG_COMPARE_LOCAL_PREFETCH, false}, + {"lfc", DEBUG_COMPARE_LOCAL_LFC, false}, + {"all", DEBUG_COMPARE_LOCAL_ALL, false}, + {NULL, 0, false} +}; + /* * XXX: These private to procarray.c, but we need them here. */ @@ -444,6 +455,9 @@ _PG_init(void) shmem_startup_hook = neon_shmem_startup_hook; #endif + /* dummy call to a Rust function in the communicator library, to check that it works */ + (void) communicator_dummy(123); + pg_init_libpagestore(); lfc_init(); pg_init_walproposer(); @@ -519,6 +533,16 @@ _PG_init(void) GUC_UNIT_KB, NULL, NULL, NULL); + DefineCustomEnumVariable( + "neon.debug_compare_local", + "Debug mode for compaing content of pages in prefetch ring/LFC/PS and local disk", + NULL, + &debug_compare_local, + DEBUG_COMPARE_LOCAL_NONE, + debug_compare_local_modes, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c index dba28c0ed6..2ce7b0086b 100644 --- a/pgxn/neon/neon_ddl_handler.c +++ b/pgxn/neon/neon_ddl_handler.c @@ -98,12 +98,14 @@ typedef struct typedef struct DdlHashTable { struct DdlHashTable *prev_table; + size_t subtrans_level; HTAB *db_table; HTAB *role_table; } DdlHashTable; static DdlHashTable RootTable; static DdlHashTable *CurrentDdlTable = &RootTable; +static int SubtransLevel; /* current nesting level of subtransactions */ static void PushKeyValue(JsonbParseState **state, char *key, char *value) @@ -332,9 +334,25 @@ SendDeltasToControlPlane() } } +static void +InitCurrentDdlTableIfNeeded() +{ + /* Lazy construction of DllHashTable chain */ + if (SubtransLevel > CurrentDdlTable->subtrans_level) + { + DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable)); + new_table->prev_table = CurrentDdlTable; + new_table->subtrans_level = SubtransLevel; + new_table->role_table = NULL; + new_table->db_table = NULL; + CurrentDdlTable = new_table; + } +} + static void InitDbTableIfNeeded() { + InitCurrentDdlTableIfNeeded(); if (!CurrentDdlTable->db_table) { HASHCTL db_ctl = {}; @@ -353,6 +371,7 @@ 
InitDbTableIfNeeded() static void InitRoleTableIfNeeded() { + InitCurrentDdlTableIfNeeded(); if (!CurrentDdlTable->role_table) { HASHCTL role_ctl = {}; @@ -371,19 +390,21 @@ InitRoleTableIfNeeded() static void PushTable() { - DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable)); - - new_table->prev_table = CurrentDdlTable; - new_table->role_table = NULL; - new_table->db_table = NULL; - CurrentDdlTable = new_table; + SubtransLevel += 1; } static void MergeTable() { - DdlHashTable *old_table = CurrentDdlTable; + DdlHashTable *old_table; + Assert(SubtransLevel >= CurrentDdlTable->subtrans_level); + if (--SubtransLevel >= CurrentDdlTable->subtrans_level) + { + return; + } + + old_table = CurrentDdlTable; CurrentDdlTable = old_table->prev_table; if (old_table->db_table) @@ -476,11 +497,15 @@ MergeTable() static void PopTable() { - /* - * Current table gets freed because it is allocated in aborted - * subtransaction's memory context. - */ - CurrentDdlTable = CurrentDdlTable->prev_table; + Assert(SubtransLevel >= CurrentDdlTable->subtrans_level); + if (--SubtransLevel < CurrentDdlTable->subtrans_level) + { + /* + * Current table gets freed because it is allocated in aborted + * subtransaction's memory context. + */ + CurrentDdlTable = CurrentDdlTable->prev_table; + } } static void diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 9df202290d..4470d3a94d 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -177,6 +177,22 @@ extern StringInfoData nm_pack_request(NeonRequest *msg); extern NeonResponse *nm_unpack_response(StringInfo s); extern char *nm_to_string(NeonMessage *msg); +/* + * If debug_compare_local>DEBUG_COMPARE_LOCAL_NONE, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +typedef enum +{ + DEBUG_COMPARE_LOCAL_NONE, /* normal mode - pages are storted locally only for unlogged relations */ + DEBUG_COMPARE_LOCAL_PREFETCH, /* if page is found in prefetch ring, then compare it with local and return */ + DEBUG_COMPARE_LOCAL_LFC, /* if page is found in LFC or prefetch ring, then compare it with local and return */ + DEBUG_COMPARE_LOCAL_ALL /* always fetch page from PS and compare it with local */ +} DebugCompareLocalMode; + +extern int debug_compare_local; + /* * API */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 43fd715bbb..9d25266e10 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -76,21 +76,11 @@ typedef PGAlignedBlock PGIOAlignedBlock; #endif -/* - * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API - * calls to md.c, and *also* do the calls to the Page Server. On every - * read, compare the versions we read from local disk and Page Server, - * and Assert that they are identical. 
- */ -/* #define DEBUG_COMPARE_LOCAL */ - -#ifdef DEBUG_COMPARE_LOCAL #include "access/nbtree.h" #include "storage/bufpage.h" #include "access/xlog_internal.h" static char *hexdump_page(char *page); -#endif #define IS_LOCAL_REL(reln) (\ NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \ @@ -108,6 +98,8 @@ typedef enum UNLOGGED_BUILD_NOT_PERMANENT } UnloggedBuildPhase; +int debug_compare_local; + static NRelFileInfo unlogged_build_rel_info; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; @@ -478,9 +470,10 @@ neon_init(void) old_redo_read_buffer_filter = redo_read_buffer_filter; redo_read_buffer_filter = neon_redo_read_buffer_filter; -#ifdef DEBUG_COMPARE_LOCAL - mdinit(); -#endif + if (debug_compare_local) + { + mdinit(); + } } /* @@ -803,13 +796,16 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: -#ifdef DEBUG_COMPARE_LOCAL - mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo); - if (forkNum == MAIN_FORKNUM) - mdcreate(reln, INIT_FORKNUM, true); -#else - mdcreate(reln, forkNum, isRedo); -#endif + if (debug_compare_local) + { + mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo); + if (forkNum == MAIN_FORKNUM) + mdcreate(reln, INIT_FORKNUM, true); + } + else + { + mdcreate(reln, forkNum, isRedo); + } return; default: @@ -848,10 +844,11 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) else set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdcreate(reln, forkNum, isRedo); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); + } } /* @@ -877,7 +874,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged - * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to + * or permanent relation (or if debug_compare_local is set). Try to * unlink, it won't do any harm if the file doesn't exist. */ mdunlink(rinfo, forkNum, isRedo); @@ -973,10 +970,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdextend(reln, forkNum, blkno, buffer, skipFsync); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); + } /* * smgr_extend is often called with an all-zeroes page, so @@ -1051,10 +1049,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forkNum), InvalidBlockNumber))); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + } /* Don't log any pages if we're not allowed to do so. 
*/ if (!XLogInsertAllowed()) @@ -1265,10 +1264,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, communicator_prefetch_pump_state(); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdwriteback(reln, forknum, blocknum, nblocks); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); + } } /* @@ -1282,7 +1282,6 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } -#ifdef DEBUG_COMPARE_LOCAL static void compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn) { @@ -1364,7 +1363,6 @@ compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, voi } } } -#endif #if PG_MAJORVERSION_NUM < 17 @@ -1417,22 +1415,28 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) { /* Prefetch hit */ -#ifdef DEBUG_COMPARE_LOCAL - compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); -#else - return; -#endif + if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH) + { + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); + } + if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH) + { + return; + } } /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { MyNeonCounters->file_cache_hits_total++; -#ifdef DEBUG_COMPARE_LOCAL - compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); -#else - return; -#endif + if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC) + { + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); + } + if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC) + { + return; + } } neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); @@ -1442,15 +1446,15 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer */ communicator_prefetch_pump_state(); -#ifdef DEBUG_COMPARE_LOCAL - compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); -#endif + if (debug_compare_local) + { + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); + } } #endif /* PG_MAJORVERSION_NUM <= 16 */ #if PG_MAJORVERSION_NUM >= 17 -#ifdef DEBUG_COMPARE_LOCAL static void compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages) { @@ -1465,7 +1469,6 @@ compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, vo } } } -#endif static void @@ -1516,13 +1519,19 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, request_lsns, nblocks, buffers, read_pages); -#ifdef DEBUG_COMPARE_LOCAL - compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); - memset(read_pages, 0, sizeof(read_pages)); -#else - if (prefetch_result == nblocks) + if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH) + { + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + } + if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks) + { return; -#endif + } + if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH) + { + memset(read_pages, 0, sizeof(read_pages)); + } + /* Try to read 
from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, @@ -1531,14 +1540,19 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; -#ifdef DEBUG_COMPARE_LOCAL - compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); - memset(read_pages, 0, sizeof(read_pages)); -#else - /* Read all blocks from LFC, so we're done */ - if (prefetch_result + lfc_result == nblocks) + if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC) + { + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + } + if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks) + { + /* Read all blocks from LFC, so we're done */ return; -#endif + } + if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC) + { + memset(read_pages, 0, sizeof(read_pages)); + } communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read_pages); @@ -1548,14 +1562,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ communicator_prefetch_pump_state(); -#ifdef DEBUG_COMPARE_LOCAL - memset(read_pages, 0xFF, sizeof(read_pages)); - compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); -#endif + if (debug_compare_local) + { + memset(read_pages, 0xFF, sizeof(read_pages)); + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + } } #endif -#ifdef DEBUG_COMPARE_LOCAL static char * hexdump_page(char *page) { @@ -1574,7 +1588,6 @@ hexdump_page(char *page) return result.data; } -#endif #if PG_MAJORVERSION_NUM < 17 /* @@ -1596,12 +1609,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo switch (reln->smgr_relpersistence) { case 0: -#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ - if (mdexists(reln, forknum)) -#else - if (mdexists(reln, INIT_FORKNUM)) -#endif + if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum)) { /* It exists locally. Guess it's unlogged then. */ #if PG_MAJORVERSION_NUM >= 17 @@ -1656,14 +1665,17 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo communicator_prefetch_pump_state(); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + { #if PG_MAJORVERSION_NUM >= 17 - mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); #else - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif -#endif + } + } } #endif @@ -1677,12 +1689,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, switch (reln->smgr_relpersistence) { case 0: -#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ - if (mdexists(reln, forknum)) -#else - if (mdexists(reln, INIT_FORKNUM)) -#endif + if (mdexists(reln, debug_compare_local ? INIT_FORKNUM : forknum)) { /* It exists locally. Guess it's unlogged then. 
*/ mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); @@ -1720,10 +1728,11 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, communicator_prefetch_pump_state(); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + } } #endif @@ -1862,10 +1871,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo */ neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, old_blocks, nblocks); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdtruncate(reln, forknum, old_blocks, nblocks); + } } /* @@ -1904,10 +1914,11 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) communicator_prefetch_pump_state(); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdimmedsync(reln, forknum); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); + } } #if PG_MAJORVERSION_NUM >= 17 @@ -1934,10 +1945,11 @@ neon_registersync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdimmedsync(reln, forknum); -#endif + if (debug_compare_local) + { + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); + } } #endif @@ -1978,10 +1990,11 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_UNLOGGED: unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; -#ifdef DEBUG_COMPARE_LOCAL - if (!IsParallelWorker()) - mdcreate(reln, INIT_FORKNUM, true); -#endif + if (debug_compare_local) + { + if (!IsParallelWorker()) + mdcreate(reln, INIT_FORKNUM, true); + } return; default: @@ -2009,11 +2022,7 @@ neon_start_unlogged_build(SMgrRelation reln) */ if (!IsParallelWorker()) { -#ifndef DEBUG_COMPARE_LOCAL - mdcreate(reln, MAIN_FORKNUM, false); -#else - mdcreate(reln, INIT_FORKNUM, true); -#endif + mdcreate(reln, debug_compare_local ? 
INIT_FORKNUM : MAIN_FORKNUM, false); } } @@ -2107,14 +2116,14 @@ neon_end_unlogged_build(SMgrRelation reln) lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks); mdclose(reln, forknum); -#ifndef DEBUG_COMPARE_LOCAL - /* use isRedo == true, so that we drop it immediately */ - mdunlink(rinfob, forknum, true); -#endif + if (!debug_compare_local) + { + /* use isRedo == true, so that we drop it immediately */ + mdunlink(rinfob, forknum, true); + } } -#ifdef DEBUG_COMPARE_LOCAL - mdunlink(rinfob, INIT_FORKNUM, true); -#endif + if (debug_compare_local) + mdunlink(rinfob, INIT_FORKNUM, true); } NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; diff --git a/proxy/README.md b/proxy/README.md index 583db36f28..e10ff3d710 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -138,3 +138,62 @@ Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` + +## auth broker setup: + +Create a postgres instance: +```sh +docker run \ + --detach \ + --name proxy-postgres \ + --env POSTGRES_HOST_AUTH_METHOD=trust \ + --env POSTGRES_USER=authenticated \ + --env POSTGRES_DB=database \ + --publish 5432:5432 \ + postgres:17-bookworm +``` + +Create a configuration file called `local_proxy.json` in the root of the repo (used also by the auth broker to validate JWTs) +```sh +{ + "jwks": [ + { + "id": "1", + "role_names": ["authenticator", "authenticated", "anon"], + "jwks_url": "https://climbing-minnow-11.clerk.accounts.dev/.well-known/jwks.json", + "provider_name": "foo", + "jwt_audience": null + } + ] +} +``` + +Start the local proxy: +```sh +cargo run --bin local_proxy -- \ + --disable_pg_session_jwt true \ + --http 0.0.0.0:7432 +``` + +Start the auth broker: +```sh +LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing -- \ + -c server.crt -k server.key \ + --is-auth-broker true \ + --wss 0.0.0.0:8080 \ + --http 0.0.0.0:7002 \ + --auth-backend local +``` + +Create a JWT in your auth provider (e.g. Clerk) and set it in the `NEON_JWT` environment variable. +```sh +export NEON_JWT="..." +``` + +Run a query against the auth broker: +```sh +curl -k "https://foo.local.neon.build:8080/sql" \ + -H "Authorization: Bearer $NEON_JWT" \ + -H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \ + -d '{"query":"select 1","params":[]}' +``` diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 8440d198df..f561df9202 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -164,21 +164,20 @@ async fn authenticate( })? .map_err(ConsoleRedirectError::from)?; - if auth_config.ip_allowlist_check_enabled { - if let Some(allowed_ips) = &db_info.allowed_ips { - if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); - } - } + if auth_config.ip_allowlist_check_enabled + && let Some(allowed_ips) = &db_info.allowed_ips + && !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) + { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); } // Check if the access over the public internet is allowed, otherwise block. Note that // the console redirect is not behind the VPC service endpoint, so we don't need to check // the VPC endpoint ID. 
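// --- editor's sketch (illustrative, not part of this diff) ---
// The proxy hunks around here repeatedly flatten nested `if let ... { if ... }` blocks
// into a single `if ... && let ... && ...` chain (let-chains, available with the Rust
// 2024 edition). The generic shape with invented names, behavior unchanged:
fn allow_peer(check_enabled: bool, allowlist: Option<&[u32]>, peer: u32) -> bool {
    // before (nested):
    //   if check_enabled {
    //       if let Some(list) = allowlist {
    //           if !list.contains(&peer) { return false; }
    //       }
    //   }
    // after (flattened, as in the hunks below):
    if check_enabled
        && let Some(list) = allowlist
        && !list.contains(&peer)
    {
        return false;
    }
    true
}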
- if let Some(public_access_allowed) = db_info.public_access_allowed { - if !public_access_allowed { - return Err(auth::AuthError::NetworkNotAllowed); - } + if let Some(public_access_allowed) = db_info.public_access_allowed + && !public_access_allowed + { + return Err(auth::AuthError::NetworkNotAllowed); } client.write_message(BeMessage::NoticeResponse("Connecting to database.")); diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 5edc878243..a716890a00 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -399,36 +399,36 @@ impl JwkCacheEntryLock { tracing::debug!(?payload, "JWT signature valid with claims"); - if let Some(aud) = expected_audience { - if payload.audience.0.iter().all(|s| s != aud) { - return Err(JwtError::InvalidClaims( - JwtClaimsError::InvalidJwtTokenAudience, - )); - } + if let Some(aud) = expected_audience + && payload.audience.0.iter().all(|s| s != aud) + { + return Err(JwtError::InvalidClaims( + JwtClaimsError::InvalidJwtTokenAudience, + )); } let now = SystemTime::now(); - if let Some(exp) = payload.expiration { - if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( - exp.duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ))); - } + if let Some(exp) = payload.expiration + && now >= exp + CLOCK_SKEW_LEEWAY + { + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired( + exp.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ))); } - if let Some(nbf) = payload.not_before { - if nbf >= now + CLOCK_SKEW_LEEWAY { - return Err(JwtError::InvalidClaims( - JwtClaimsError::JwtTokenNotYetReadyToUse( - nbf.duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - )); - } + if let Some(nbf) = payload.not_before + && nbf >= now + CLOCK_SKEW_LEEWAY + { + return Err(JwtError::InvalidClaims( + JwtClaimsError::JwtTokenNotYetReadyToUse( + nbf.duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + )); } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 2e3013ead0..e7805d8bfe 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -171,7 +171,6 @@ impl ComputeUserInfo { pub(crate) enum ComputeCredentialKeys { AuthKeys(AuthKeys), JwtPayload(Vec), - None, } impl TryFrom for ComputeUserInfo { @@ -346,15 +345,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { Err(e) => { // The password could have been changed, so we invalidate the cache. // We should only invalidate the cache if the TTL might have expired. 
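// --- editor's sketch (illustrative, not part of this diff) ---
// The jwt.rs hunks above keep the same validation logic while flattening it into let
// chains: a token is rejected once `now >= exp + CLOCK_SKEW_LEEWAY`, and rejected as
// not-yet-valid while `nbf >= now + CLOCK_SKEW_LEEWAY`. The same check in isolation;
// the function, error strings, and the leeway value are invented for illustration:
use std::time::{Duration, SystemTime};

const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); // assumed value, not from this PR

fn check_token_window(
    now: SystemTime,
    exp: Option<SystemTime>,
    nbf: Option<SystemTime>,
) -> Result<(), &'static str> {
    // expired only once we are past exp by more than the allowed clock skew
    if let Some(exp) = exp
        && now >= exp + CLOCK_SKEW_LEEWAY
    {
        return Err("token expired");
    }
    // not yet valid only if nbf is still ahead of now by more than the allowed skew
    if let Some(nbf) = nbf
        && nbf >= now + CLOCK_SKEW_LEEWAY
    {
        return Err("token not yet valid");
    }
    Ok(())
}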
- if e.is_password_failed() { - #[allow(irrefutable_let_patterns)] - if let ControlPlaneClient::ProxyV1(api) = &*api { - if let Some(ep) = &user_info.endpoint_id { - api.caches - .project_info - .maybe_invalidate_role_secret(ep, &user_info.user); - } - } + if e.is_password_failed() + && let ControlPlaneClient::ProxyV1(api) = &*api + && let Some(ep) = &user_info.endpoint_id + { + api.caches + .project_info + .maybe_invalidate_role_secret(ep, &user_info.user); } Err(e) diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index 423ecf821e..401203d48c 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -1,43 +1,37 @@ use std::net::SocketAddr; use std::pin::pin; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, bail, ensure}; +use anyhow::bail; use arc_swap::ArcSwapOption; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8PathBuf; use clap::Parser; -use compute_api::spec::LocalProxySpec; use futures::future::Either; -use thiserror::Error; use tokio::net::TcpListener; use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; -use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend}; +use crate::auth::backend::local::LocalBackend; use crate::auth::{self}; use crate::cancellation::CancellationHandler; use crate::config::{ self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, + refresh_config_loop, }; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; -use crate::ext::TaskExt; use crate::http::health_server::AppMetrics; -use crate::intern::RoleNameInt; use crate::metrics::{Metrics, ThreadPoolMetrics}; use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; use crate::serverless::cancel_set::CancelSet; use crate::serverless::{self, GlobalConnPoolOptions}; use crate::tls::client_config::compute_client_config_with_root_certs; -use crate::types::RoleName; use crate::url::ApiUrl; project_git_version!(GIT_VERSION); @@ -82,6 +76,11 @@ struct LocalProxyCliArgs { /// Path of the local proxy PID file #[clap(long, default_value = "./local_proxy.pid")] pid_path: Utf8PathBuf, + /// Disable pg_session_jwt extension installation + /// This is useful for testing the local proxy with vanilla postgres. 
+ #[clap(long, default_value = "false")] + #[cfg(feature = "testing")] + disable_pg_session_jwt: bool, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -282,6 +281,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, connect_to_compute: compute_config, + #[cfg(feature = "testing")] + disable_pg_session_jwt: args.disable_pg_session_jwt, }))) } @@ -293,132 +294,3 @@ fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'stati Box::leak(Box::new(auth_backend)) } - -#[derive(Error, Debug)] -enum RefreshConfigError { - #[error(transparent)] - Read(#[from] std::io::Error), - #[error(transparent)] - Parse(#[from] serde_json::Error), - #[error(transparent)] - Validate(anyhow::Error), - #[error(transparent)] - Tls(anyhow::Error), -} - -async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { - let mut init = true; - loop { - rx.notified().await; - - match refresh_config_inner(config, &path).await { - Ok(()) => {} - // don't log for file not found errors if this is the first time we are checking - // for computes that don't use local_proxy, this is not an error. - Err(RefreshConfigError::Read(e)) - if init && e.kind() == std::io::ErrorKind::NotFound => - { - debug!(error=?e, ?path, "could not read config file"); - } - Err(RefreshConfigError::Tls(e)) => { - error!(error=?e, ?path, "could not read TLS certificates"); - } - Err(e) => { - error!(error=?e, ?path, "could not read config file"); - } - } - - init = false; - } -} - -async fn refresh_config_inner( - config: &ProxyConfig, - path: &Utf8Path, -) -> Result<(), RefreshConfigError> { - let bytes = tokio::fs::read(&path).await?; - let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - - let mut jwks_set = vec![]; - - fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { - let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - - ensure!( - jwks_url.has_authority() - && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); - - ensure!( - jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); - - // clear username, password and ports - jwks_url - .set_username("") - .expect("url can be a base and has a valid host and is not a file. should not error"); - jwks_url - .set_password(None) - .expect("url can be a base and has a valid host and is not a file. should not error"); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks_url.set_fragment(None); - jwks_url.query_pairs_mut().clear().finish(); - - if jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. 
- if cfg!(not(feature = "testing")) { - jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - Ok(JwksSettings { - id: jwks.id, - jwks_url, - _provider_name: jwks.provider_name, - jwt_audience: jwks.jwt_audience, - role_names: jwks - .role_names - .into_iter() - .map(RoleName::from) - .map(|s| RoleNameInt::from(&s)) - .collect(), - }) - } - - for jwks in data.jwks.into_iter().flatten() { - jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); - } - - info!("successfully loaded new config"); - JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); - - if let Some(tls_config) = data.tls { - let tls_config = tokio::task::spawn_blocking(move || { - crate::tls::server_config::configure_tls( - tls_config.key_path.as_ref(), - tls_config.cert_path.as_ref(), - None, - false, - ) - }) - .await - .propagate_task_panic() - .map_err(RefreshConfigError::Tls)?; - config.tls_config.store(Some(Arc::new(tls_config))); - } - - Ok(()) -} diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index b877aaddef..4ac8b6a995 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -4,6 +4,7 @@ //! This allows connecting to pods/services running in the same Kubernetes cluster from //! the outside. Similar to an ingress controller for HTTPS. +use std::io; use std::net::SocketAddr; use std::path::Path; use std::sync::Arc; @@ -229,7 +230,6 @@ pub(super) async fn task_main( .set_nodelay(true) .context("failed to set socket option")?; - info!(%peer_addr, "serving"); let ctx = RequestContext::new( session_id, ConnectionInfo { @@ -241,6 +241,14 @@ pub(super) async fn task_main( handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await } .unwrap_or_else(|e| { + if let Some(FirstMessage(io_error)) = e.downcast_ref() { + // this is noisy. if we get EOF on the very first message that's likely + // just NLB doing a healthcheck. + if io_error.kind() == io::ErrorKind::UnexpectedEof { + return; + } + } + // Acknowledge that the task has finished with an error. 
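The `pg_sni_router` change above wraps the error from the very first startup message so the caller can recognise it and skip logging an immediate EOF (most likely an NLB health check). A stripped-down sketch of that downcast pattern, with the logging decision reduced to a boolean for clarity:

```rust
use std::io;

use anyhow::Error;

/// Transparent wrapper marking an error as having happened on the very first
/// protocol message, before the client sent anything meaningful.
#[derive(Debug, thiserror::Error)]
#[error(transparent)]
struct FirstMessage(io::Error);

fn worth_logging(e: &Error) -> bool {
    // EOF on the first message usually means a TCP probe connected and hung
    // up immediately; everything else is still worth an error log.
    match e.downcast_ref::<FirstMessage>() {
        Some(FirstMessage(io_error)) => io_error.kind() != io::ErrorKind::UnexpectedEof,
        None => true,
    }
}

fn main() {
    let probe = Error::new(FirstMessage(io::ErrorKind::UnexpectedEof.into()));
    assert!(!worth_logging(&probe));

    let other = Error::msg("handshake failed after startup");
    assert!(worth_logging(&other));
}
```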
error!("per-client task finished with an error: {e:#}"); }) @@ -257,12 +265,19 @@ pub(super) async fn task_main( Ok(()) } +#[derive(Debug, thiserror::Error)] +#[error(transparent)] +struct FirstMessage(io::Error); + async fn ssl_handshake( ctx: &RequestContext, raw_stream: S, tls_config: Arc, ) -> anyhow::Result> { - let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)).await?; + let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)) + .await + .map_err(FirstMessage)?; + match msg { FeStartupPacket::SslRequest { direct: None } => { let raw = stream.accept_tls().await?; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 2133f33a4d..691709ce2a 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -10,21 +10,29 @@ use std::time::Duration; use anyhow::Context; use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; +#[cfg(any(test, feature = "testing"))] +use camino::Utf8PathBuf; use futures::future::Either; use itertools::{Itertools, Position}; use rand::{Rng, thread_rng}; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; +#[cfg(any(test, feature = "testing"))] +use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{Instrument, error, info, warn}; +use tracing::{error, info, warn}; use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; +#[cfg(any(test, feature = "testing"))] +use crate::auth::backend::local::LocalBackend; use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned}; use crate::batch::BatchQueue; use crate::cancellation::{CancellationHandler, CancellationProcessor}; +#[cfg(any(test, feature = "testing"))] +use crate::config::refresh_config_loop; use crate::config::{ self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, @@ -60,6 +68,9 @@ enum AuthBackendType { #[cfg(any(test, feature = "testing"))] Postgres, + + #[cfg(any(test, feature = "testing"))] + Local, } /// Neon proxy/router @@ -74,6 +85,10 @@ struct ProxyCliArgs { proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, + /// Path of the local proxy config file (used for local-file auth backend) + #[clap(long, default_value = "./local_proxy.json")] + #[cfg(any(test, feature = "testing"))] + config_path: Utf8PathBuf, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] mgmt: SocketAddr, @@ -180,7 +195,9 @@ struct ProxyCliArgs { #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] project_info_cache: String, /// cache for all valid endpoints - #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + // TODO: remove after a couple of releases. 
+ #[clap(long, default_value_t = String::new())] + #[deprecated] endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, @@ -226,6 +243,14 @@ struct ProxyCliArgs { #[clap(flatten)] pg_sni_router: PgSniRouterArgs, + + /// if this is not local proxy, this toggles whether we accept Postgres REST requests + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_rest_broker: bool, + + /// cache for `db_schema_cache` introspection (use `size=0` to disable) + #[clap(long, default_value = "size=1000,ttl=1h")] + db_schema_cache: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -386,6 +411,8 @@ pub async fn run() -> anyhow::Result<()> { 64, )); + #[cfg(any(test, feature = "testing"))] + let refresh_config_notify = Arc::new(Notify::new()); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -412,6 +439,17 @@ pub async fn run() -> anyhow::Result<()> { endpoint_rate_limiter.clone(), )); } + + // if auth backend is local, we need to load the config file + #[cfg(any(test, feature = "testing"))] + if let auth::Backend::Local(_) = &auth_backend { + refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop( + config, + args.config_path, + refresh_config_notify.clone(), + )); + } } Either::Right(auth_backend) => { if let Some(proxy_listener) = proxy_listener { @@ -462,7 +500,13 @@ pub async fn run() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {})); + + maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), { + move || { + #[cfg(any(test, feature = "testing"))] + refresh_config_notify.notify_one(); + } + })); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { @@ -478,52 +522,42 @@ pub async fn run() -> anyhow::Result<()> { maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); } - #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] - if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { - if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - if let Some(client) = redis_client { - // project info cache and invalidation of that cache. - let cache = api.caches.project_info.clone(); - maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone())); - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend + && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api + && let Some(client) = redis_client + { + // project info cache and invalidation of that cache. + let cache = api.caches.project_info.clone(); + maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone())); + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. - // This prevents immediate exit and pod restart, - // which can cause hammering of the redis in case of connection issues. 
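The `refresh_config_notify` wiring leans on a useful property of `tokio::sync::Notify`: `notify_one` stores a permit when no task is waiting yet, so priming the notify before spawning `refresh_config_loop` guarantees an initial load, and the signal-handler closure simply re-arms it later (which signal triggers that closure is defined elsewhere in the crate). A small sketch of just that mechanism, with a print standing in for the real config reload:

```rust
use std::sync::Arc;
use std::time::Duration;

use tokio::sync::Notify;

// Stand-in for refresh_config_loop: only the wake-up mechanics are shown.
async fn reload_loop(path: String, rx: Arc<Notify>) {
    loop {
        // Returns immediately if a permit was stored by an earlier notify_one().
        rx.notified().await;
        println!("re-reading {path}");
    }
}

#[tokio::main]
async fn main() {
    let notify = Arc::new(Notify::new());

    // Prime the loop so the config is read once at startup...
    notify.notify_one();
    tokio::spawn(reload_loop("local_proxy.json".into(), notify.clone()));

    // ...and wake it again whenever the signal handler fires.
    notify.notify_one();
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```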
- // cancellation key management - let mut redis_kv_client = RedisKVClient::new(client.clone()); - for attempt in (0..3).with_position() { - match redis_kv_client.try_connect().await { - Ok(()) => { - info!("Connected to Redis KV client"); - cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor { - client: redis_kv_client, - batch_size: args.cancellation_batch_size, - })); + // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. + // This prevents immediate exit and pod restart, + // which can cause hammering of the redis in case of connection issues. + // cancellation key management + let mut redis_kv_client = RedisKVClient::new(client.clone()); + for attempt in (0..3).with_position() { + match redis_kv_client.try_connect().await { + Ok(()) => { + info!("Connected to Redis KV client"); + cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor { + client: redis_kv_client, + batch_size: args.cancellation_batch_size, + })); - break; - } - Err(e) => { - error!("Failed to connect to Redis KV client: {e}"); - if matches!(attempt, Position::Last(_)) { - bail!( - "Failed to connect to Redis KV client after {} attempts", - attempt.into_inner() - ); - } - let jitter = thread_rng().gen_range(0..100); - tokio::time::sleep(Duration::from_millis(1000 + jitter)).await; - } - } + break; + } + Err(e) => { + error!("Failed to connect to Redis KV client: {e}"); + if matches!(attempt, Position::Last(_)) { + bail!( + "Failed to connect to Redis KV client after {} attempts", + attempt.into_inner() + ); + } + let jitter = thread_rng().gen_range(0..100); + tokio::time::sleep(Duration::from_millis(1000 + jitter)).await; } - - // listen for notifications of new projects/endpoints/branches - let cache = api.caches.endpoints_cache.clone(); - let span = tracing::info_span!("endpoints_cache"); - maintenance_tasks.spawn( - async move { cache.do_read(client, cancellation_token.clone()).await } - .instrument(span), - ); } } } @@ -653,6 +687,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute: compute_config, + #[cfg(feature = "testing")] + disable_pg_session_jwt: false, }; let config = Box::leak(Box::new(config)); @@ -671,18 +707,15 @@ fn build_auth_backend( let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, - endpoint_cache_config, ))); let config::ConcurrencyLockOptions { @@ -752,18 +785,15 @@ fn build_auth_backend( let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; - let endpoint_cache_config: config::EndpointCacheConfig = - args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!( "Using AllowedIpsCache (wake_compute) with 
options={project_info_cache_config:?}" ); - info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( wake_compute_cache_config, project_info_cache_config, - endpoint_cache_config, ))); let config::ConcurrencyLockOptions { @@ -806,6 +836,19 @@ fn build_auth_backend( Ok(Either::Right(config)) } + + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Local => { + let postgres: SocketAddr = "127.0.0.1:7432".parse()?; + let compute_ctl: ApiUrl = "http://127.0.0.1:3081/".parse()?; + let auth_backend = crate::auth::Backend::Local( + crate::auth::backend::MaybeOwned::Owned(LocalBackend::new(postgres, compute_ctl)), + ); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } } } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs deleted file mode 100644 index 3c88e07484..0000000000 --- a/proxy/src/cache/endpoints.rs +++ /dev/null @@ -1,283 +0,0 @@ -use std::convert::Infallible; -use std::future::pending; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; - -use clashmap::ClashSet; -use redis::streams::{StreamReadOptions, StreamReadReply}; -use redis::{AsyncCommands, FromRedisValue, Value}; -use serde::Deserialize; -use tokio_util::sync::CancellationToken; -use tracing::info; - -use crate::config::EndpointCacheConfig; -use crate::context::RequestContext; -use crate::ext::LockExt; -use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; -use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; -use crate::rate_limiter::GlobalRateLimiter; -use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::types::EndpointId; - -// TODO: this could be an enum, but events in Redis need to be fixed first. -// ProjectCreated was sent with type:branch_created. So we ignore type. -#[derive(Deserialize, Debug, Clone, PartialEq)] -struct ControlPlaneEvent { - endpoint_created: Option, - branch_created: Option, - project_created: Option, - #[serde(rename = "type")] - _type: Option, -} - -#[derive(Deserialize, Debug, Clone, PartialEq)] -struct EndpointCreated { - endpoint_id: EndpointIdInt, -} - -#[derive(Deserialize, Debug, Clone, PartialEq)] -struct BranchCreated { - branch_id: BranchIdInt, -} - -#[derive(Deserialize, Debug, Clone, PartialEq)] -struct ProjectCreated { - project_id: ProjectIdInt, -} - -impl TryFrom<&Value> for ControlPlaneEvent { - type Error = anyhow::Error; - fn try_from(value: &Value) -> Result { - let json = String::from_redis_value(value)?; - Ok(serde_json::from_str(&json)?) - } -} - -pub struct EndpointsCache { - config: EndpointCacheConfig, - endpoints: ClashSet, - branches: ClashSet, - projects: ClashSet, - ready: AtomicBool, - limiter: Arc>, -} - -impl EndpointsCache { - pub(crate) fn new(config: EndpointCacheConfig) -> Self { - Self { - limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( - config.limiter_info.clone(), - ))), - config, - endpoints: ClashSet::new(), - branches: ClashSet::new(), - projects: ClashSet::new(), - ready: AtomicBool::new(false), - } - } - - pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool { - if !self.ready.load(Ordering::Acquire) { - // the endpoint cache is not yet fully initialised. 
- return true; - } - - if !self.should_reject(endpoint) { - ctx.set_rejected(false); - return true; - } - - // report that we might want to reject this endpoint - ctx.set_rejected(true); - - // If cache is disabled, just collect the metrics and return. - if self.config.disable_cache { - return true; - } - - // If the limiter allows, we can pretend like it's valid - // (incase it is, due to redis channel lag). - if self.limiter.lock_propagate_poison().check() { - return true; - } - - // endpoint not found, and there's too much load. - false - } - - fn should_reject(&self, endpoint: &EndpointId) -> bool { - if endpoint.is_endpoint() { - let Some(endpoint) = EndpointIdInt::get(endpoint) else { - // if we haven't interned this endpoint, it's not in the cache. - return true; - }; - !self.endpoints.contains(&endpoint) - } else if endpoint.is_branch() { - let Some(branch) = BranchIdInt::get(endpoint) else { - // if we haven't interned this branch, it's not in the cache. - return true; - }; - !self.branches.contains(&branch) - } else { - let Some(project) = ProjectIdInt::get(endpoint) else { - // if we haven't interned this project, it's not in the cache. - return true; - }; - !self.projects.contains(&project) - } - } - - fn insert_event(&self, event: ControlPlaneEvent) { - if let Some(endpoint_created) = event.endpoint_created { - self.endpoints.insert(endpoint_created.endpoint_id); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::EndpointCreated); - } else if let Some(branch_created) = event.branch_created { - self.branches.insert(branch_created.branch_id); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::BranchCreated); - } else if let Some(project_created) = event.project_created { - self.projects.insert(project_created.project_id); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::ProjectCreated); - } - } - - pub async fn do_read( - &self, - mut con: ConnectionWithCredentialsProvider, - cancellation_token: CancellationToken, - ) -> anyhow::Result { - let mut last_id = "0-0".to_string(); - loop { - if let Err(e) = con.connect().await { - tracing::error!("error connecting to redis: {:?}", e); - self.ready.store(false, Ordering::Release); - } - if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { - tracing::error!("error reading from redis: {:?}", e); - self.ready.store(false, Ordering::Release); - } - if cancellation_token.is_cancelled() { - info!("cancellation token is cancelled, exiting"); - // Maintenance tasks run forever. Sleep forever when canceled. 
- pending::<()>().await; - } - tokio::time::sleep(self.config.retry_interval).await; - } - } - - async fn read_from_stream( - &self, - con: &mut ConnectionWithCredentialsProvider, - last_id: &mut String, - ) -> anyhow::Result<()> { - tracing::info!("reading endpoints/branches/projects from redis"); - self.batch_read( - con, - StreamReadOptions::default().count(self.config.initial_batch_size), - last_id, - true, - ) - .await?; - tracing::info!("ready to filter user requests"); - self.ready.store(true, Ordering::Release); - self.batch_read( - con, - StreamReadOptions::default() - .count(self.config.default_batch_size) - .block(self.config.xread_timeout.as_millis() as usize), - last_id, - false, - ) - .await - } - - async fn batch_read( - &self, - conn: &mut ConnectionWithCredentialsProvider, - opts: StreamReadOptions, - last_id: &mut String, - return_when_finish: bool, - ) -> anyhow::Result<()> { - let mut total: usize = 0; - loop { - let mut res: StreamReadReply = conn - .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) - .await?; - - if res.keys.is_empty() { - if return_when_finish { - if total != 0 { - break; - } - anyhow::bail!( - "Redis stream {} is empty, cannot be used to filter endpoints", - self.config.stream_name - ); - } - // If we are not returning when finish, we should wait for more data. - continue; - } - if res.keys.len() != 1 { - anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); - } - - let key = res.keys.pop().expect("Checked length above"); - let len = key.ids.len(); - for stream_id in key.ids { - total += 1; - for value in stream_id.map.values() { - match value.try_into() { - Ok(event) => self.insert_event(event), - Err(err) => { - Metrics::get().proxy.redis_errors_total.inc(RedisErrors { - channel: &self.config.stream_name, - }); - tracing::error!("error parsing value {value:?}: {err:?}"); - } - } - } - if total.is_power_of_two() { - tracing::debug!("endpoints read {}", total); - } - *last_id = stream_id.id; - } - if return_when_finish && len <= self.config.default_batch_size { - break; - } - } - tracing::info!("read {} endpoints/branches/projects from redis", total); - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_control_plane_event() { - let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#; - - let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into(); - - assert_eq!( - serde_json::from_str::(s).unwrap(), - ControlPlaneEvent { - endpoint_created: Some(EndpointCreated { - endpoint_id: endpoint_id.into(), - }), - branch_created: None, - project_created: None, - _type: Some("endpoint_created".into()), - } - ); - } -} diff --git a/proxy/src/cache/mod.rs b/proxy/src/cache/mod.rs index 6c168144a7..ce7f781213 100644 --- a/proxy/src/cache/mod.rs +++ b/proxy/src/cache/mod.rs @@ -1,5 +1,4 @@ pub(crate) mod common; -pub(crate) mod endpoints; pub(crate) mod project_info; mod timed_lru; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ffc0cf43f1..74413f1a7d 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -64,6 +64,13 @@ impl Pipeline { let responses = self.replies; let batch_size = self.inner.len(); + if !client.credentials_refreshed() { + tracing::debug!( + "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..." 
+ ); + tokio::time::sleep(Duration::from_secs(5)).await; + } + match client.query(&self.inner).await { // for each reply, we expect that many values. Ok(Value::Array(values)) if values.len() == responses => { @@ -127,6 +134,14 @@ impl QueueProcessing for CancellationProcessor { } async fn apply(&mut self, batch: Vec) -> Vec { + if !self.client.credentials_refreshed() { + // this will cause a timeout for cancellation operations + tracing::debug!( + "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..." + ); + tokio::time::sleep(Duration::from_secs(5)).await; + } + let mut pipeline = Pipeline::with_capacity(batch.len()); let batch_size = batch.len(); diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index 0a19090ce0..7b9183b05e 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -165,7 +165,7 @@ impl AuthInfo { ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => { Some(Auth::Scram(Box::new(auth_keys))) } - ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => None, + ComputeCredentialKeys::JwtPayload(_) => None, }, server_params: StartupMessageParams::default(), skip_db_user: false, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index cee15ac7fa..6157dc8a6a 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -4,17 +4,26 @@ use std::time::Duration; use anyhow::{Context, Ok, bail, ensure}; use arc_swap::ArcSwapOption; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; +use compute_api::spec::LocalProxySpec; use remote_storage::RemoteStorageConfig; +use thiserror::Error; +use tokio::sync::Notify; +use tracing::{debug, error, info, warn}; use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::local::JWKS_ROLE_MAP; use crate::control_plane::locks::ApiLocks; -use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::ext::TaskExt; +use crate::intern::RoleNameInt; +use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig}; use crate::scram::threadpool::ThreadPool; use crate::serverless::GlobalConnPoolOptions; use crate::serverless::cancel_set::CancelSet; pub use crate::tls::server_config::{TlsConfig, configure_tls}; -use crate::types::Host; +use crate::types::{Host, RoleName}; pub struct ProxyConfig { pub tls_config: ArcSwapOption, @@ -26,6 +35,8 @@ pub struct ProxyConfig { pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, pub connect_to_compute: ComputeConfig, + #[cfg(feature = "testing")] + pub disable_pg_session_jwt: bool, } pub struct ComputeConfig { @@ -69,79 +80,6 @@ pub struct AuthenticationConfig { pub console_redirect_confirmation_timeout: tokio::time::Duration, } -#[derive(Debug)] -pub struct EndpointCacheConfig { - /// Batch size to receive all endpoints on the startup. - pub initial_batch_size: usize, - /// Batch size to receive endpoints. - pub default_batch_size: usize, - /// Timeouts for the stream read operation. - pub xread_timeout: Duration, - /// Stream name to read from. - pub stream_name: String, - /// Limiter info (to distinguish when to enable cache). - pub limiter_info: Vec, - /// Disable cache. - /// If true, cache is ignored, but reports all statistics. - pub disable_cache: bool, - /// Retry interval for the stream read operation. - pub retry_interval: Duration, -} - -impl EndpointCacheConfig { - /// Default options for [`crate::control_plane::NodeInfoCache`]. 
- /// Notice that by default the limiter is empty, which means that cache is disabled. - pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; - - /// Parse cache options passed via cmdline. - /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. - fn parse(options: &str) -> anyhow::Result { - let mut initial_batch_size = None; - let mut default_batch_size = None; - let mut xread_timeout = None; - let mut stream_name = None; - let mut limiter_info = vec![]; - let mut disable_cache = false; - let mut retry_interval = None; - - for option in options.split(',') { - let (key, value) = option - .split_once('=') - .with_context(|| format!("bad key-value pair: {option}"))?; - - match key { - "initial_batch_size" => initial_batch_size = Some(value.parse()?), - "default_batch_size" => default_batch_size = Some(value.parse()?), - "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), - "stream_name" => stream_name = Some(value.to_string()), - "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), - "disable_cache" => disable_cache = value.parse()?, - "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), - unknown => bail!("unknown key: {unknown}"), - } - } - RateBucketInfo::validate(&mut limiter_info)?; - - Ok(Self { - initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, - default_batch_size: default_batch_size.context("missing `default_batch_size`")?, - xread_timeout: xread_timeout.context("missing `xread_timeout`")?, - stream_name: stream_name.context("missing `stream_name`")?, - disable_cache, - limiter_info, - retry_interval: retry_interval.context("missing `retry_interval`")?, - }) - } -} - -impl FromStr for EndpointCacheConfig { - type Err = anyhow::Error; - - fn from_str(options: &str) -> Result { - let error = || format!("failed to parse endpoint cache options '{options}'"); - Self::parse(options).with_context(error) - } -} #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub remote_storage_config: Option, @@ -409,6 +347,135 @@ impl FromStr for ConcurrencyLockOptions { } } +#[derive(Error, Debug)] +pub(crate) enum RefreshConfigError { + #[error(transparent)] + Read(#[from] std::io::Error), + #[error(transparent)] + Parse(#[from] serde_json::Error), + #[error(transparent)] + Validate(anyhow::Error), + #[error(transparent)] + Tls(anyhow::Error), +} + +pub(crate) async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { + let mut init = true; + loop { + rx.notified().await; + + match refresh_config_inner(config, &path).await { + std::result::Result::Ok(()) => {} + // don't log for file not found errors if this is the first time we are checking + // for computes that don't use local_proxy, this is not an error. 
+ Err(RefreshConfigError::Read(e)) + if init && e.kind() == std::io::ErrorKind::NotFound => + { + debug!(error=?e, ?path, "could not read config file"); + } + Err(RefreshConfigError::Tls(e)) => { + error!(error=?e, ?path, "could not read TLS certificates"); + } + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } + + init = false; + } +} + +pub(crate) async fn refresh_config_inner( + config: &ProxyConfig, + path: &Utf8Path, +) -> Result<(), RefreshConfigError> { + let bytes = tokio::fs::read(&path).await?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; + + let mut jwks_set = vec![]; + + fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; + + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. 
+ if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + Ok(JwksSettings { + id: jwks.id, + jwks_url, + _provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) + } + + for jwks in data.jwks.into_iter().flatten() { + jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); + } + + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + + if let Some(tls_config) = data.tls { + let tls_config = tokio::task::spawn_blocking(move || { + crate::tls::server_config::configure_tls( + tls_config.key_path.as_ref(), + tls_config.cert_path.as_ref(), + None, + false, + ) + }) + .await + .propagate_task_panic() + .map_err(RefreshConfigError::Tls)?; + config.tls_config.store(Some(Arc::new(tls_config))); + } + + std::result::Result::Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index df1c4e194a..3a8828e70c 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -7,7 +7,7 @@ use once_cell::sync::OnceCell; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{Span, debug, error, info_span}; +use tracing::{Span, error, info_span}; use try_lock::TryLock; use uuid::Uuid; @@ -15,10 +15,7 @@ use self::parquet::RequestData; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::error::ErrorKind; use crate::intern::{BranchIdInt, ProjectIdInt}; -use crate::metrics::{ - ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol, - Waiting, -}; +use crate::metrics::{LatencyAccumulated, LatencyTimer, Metrics, Protocol, Waiting}; use crate::pqproto::StartupMessageParams; use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra}; use crate::types::{DbName, EndpointId, RoleName}; @@ -70,8 +67,6 @@ struct RequestContextInner { // This sender is only used to log the length of session in case of success. disconnect_sender: Option>, pub(crate) latency_timer: LatencyTimer, - // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
- rejected: Option, disconnect_timestamp: Option>, } @@ -106,7 +101,6 @@ impl Clone for RequestContext { auth_method: inner.auth_method.clone(), jwt_issuer: inner.jwt_issuer.clone(), success: inner.success, - rejected: inner.rejected, cold_start_info: inner.cold_start_info, pg_options: inner.pg_options.clone(), testodrome_query_id: inner.testodrome_query_id.clone(), @@ -151,7 +145,6 @@ impl RequestContext { auth_method: None, jwt_issuer: None, success: false, - rejected: None, cold_start_info: ColdStartInfo::Unknown, pg_options: None, testodrome_query_id: None, @@ -183,11 +176,6 @@ impl RequestContext { ) } - pub(crate) fn set_rejected(&self, rejected: bool) { - let mut this = self.0.try_lock().expect("should not deadlock"); - this.rejected = Some(rejected); - } - pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) { self.0 .try_lock() @@ -209,11 +197,9 @@ impl RequestContext { if let Some(options_str) = options.get("options") { // If not found directly, try to extract it from the options string for option in options_str.split_whitespace() { - if option.starts_with("neon_query_id:") { - if let Some(value) = option.strip_prefix("neon_query_id:") { - this.set_testodrome_id(value.into()); - break; - } + if let Some(value) = option.strip_prefix("neon_query_id:") { + this.set_testodrome_id(value.into()); + break; } } } @@ -463,38 +449,6 @@ impl RequestContextInner { } fn log_connect(&mut self) { - let outcome = if self.success { - ConnectOutcome::Success - } else { - ConnectOutcome::Failed - }; - - // TODO: get rid of entirely/refactor - // check for false positives - // AND false negatives - if let Some(rejected) = self.rejected { - let ep = self - .endpoint_id - .as_ref() - .map(|x| x.as_str()) - .unwrap_or_default(); - // This makes sense only if cache is disabled - debug!( - ?outcome, - ?rejected, - ?ep, - "check endpoint is valid with outcome" - ); - Metrics::get() - .proxy - .invalid_endpoints_total - .inc(InvalidEndpointsGroup { - protocol: self.protocol, - rejected: rejected.into(), - outcome, - }); - } - if let Some(tx) = self.sender.take() { // If type changes, this error handling needs to be updated. let tx: mpsc::UnboundedSender = tx; diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index 8c76d034f7..fc263b73b1 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -159,13 +159,6 @@ impl NeonControlPlaneClient { ctx: &RequestContext, endpoint: &EndpointId, ) -> Result, GetEndpointJwksError> { - if !self - .caches - .endpoints_cache - .is_valid(ctx, &endpoint.normalize()) - { - return Err(GetEndpointJwksError::EndpointNotFound); - } let request_id = ctx.session_id().to_string(); async { let request = self @@ -250,10 +243,8 @@ impl NeonControlPlaneClient { info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response.status(), response.bytes().await?)?; - // Unfortunately, ownership won't let us use `Option::ok_or` here. 
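The deleted `Option::ok_or` comment above points at an ownership problem that the lines just below solve with `let ... else`: the error wants to own the address string while the parse result still borrows it. A minimal reproduction with made-up names and a plain `String` error:

```rust
fn parse_host_port(addr: &str) -> Option<(&str, u16)> {
    let (host, port) = addr.rsplit_once(':')?;
    Some((host, port.parse().ok()?))
}

fn resolve(address: String) -> Result<(String, u16), String> {
    // An `ok_or(...)`-style one-liner would have to move `address` into the
    // error while the parse result still borrows it; with `let ... else` the
    // else-block only runs once the match has failed and no borrow survives.
    let Some((host, port)) = parse_host_port(&address) else {
        return Err(format!("bad compute address: {address}"));
    };
    Ok((host.to_owned(), port))
}

fn main() {
    assert!(resolve("example.internal:5432".to_owned()).is_ok());
    assert!(resolve("no-port-here".to_owned()).is_err());
}
```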
- let (host, port) = match parse_host_port(&body.address) { - None => return Err(WakeComputeError::BadComputeAddress(body.address)), - Some(x) => x, + let Some((host, port)) = parse_host_port(&body.address) else { + return Err(WakeComputeError::BadComputeAddress(body.address)); }; let host_addr = IpAddr::from_str(host).ok(); @@ -302,11 +293,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(secret); } - if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) { - info!("endpoint is not valid, skipping the request"); - return Err(GetAuthInfoError::UnknownEndpoint); - } - let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?; let control = EndpointAccessControl { @@ -348,11 +334,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient { return Ok(control); } - if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) { - info!("endpoint is not valid, skipping the request"); - return Err(GetAuthInfoError::UnknownEndpoint); - } - let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?; let control = EndpointAccessControl { diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index 4e5f5c7899..ecd4db29b2 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -13,9 +13,8 @@ use tracing::{debug, info}; use super::{EndpointAccessControl, RoleAccessControl}; use crate::auth::backend::ComputeUserInfo; use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; -use crate::cache::endpoints::EndpointsCache; use crate::cache::project_info::ProjectInfoCacheImpl; -use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::config::{CacheOptions, ProjectInfoCacheOptions}; use crate::context::RequestContext; use crate::control_plane::{CachedNodeInfo, ControlPlaneApi, NodeInfoCache, errors}; use crate::error::ReportableError; @@ -121,15 +120,12 @@ pub struct ApiCaches { pub(crate) node_info: NodeInfoCache, /// Cache which stores project_id -> endpoint_ids mapping. pub project_info: Arc, - /// List of all valid endpoints. - pub endpoints_cache: Arc, } impl ApiCaches { pub fn new( wake_compute_cache_config: CacheOptions, project_info_cache_config: ProjectInfoCacheOptions, - endpoint_cache_config: EndpointCacheConfig, ) -> Self { Self { node_info: NodeInfoCache::new( @@ -139,7 +135,6 @@ impl ApiCaches { true, ), project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), - endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), } } } @@ -213,7 +208,12 @@ impl ApiLocks { self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - debug!("acquired permit {:?}", now.elapsed().as_secs_f64()); + + if permit.is_ok() { + debug!(elapsed = ?now.elapsed(), "acquired permit"); + } else { + debug!(elapsed = ?now.elapsed(), "timed out acquiring permit"); + } Ok(WakeComputePermit { permit: permit? }) } diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs index 77312c89c5..f640657d90 100644 --- a/proxy/src/control_plane/errors.rs +++ b/proxy/src/control_plane/errors.rs @@ -99,10 +99,6 @@ pub(crate) enum GetAuthInfoError { #[error(transparent)] ApiError(ControlPlaneError), - - /// Proxy does not know about the endpoint in advanced - #[error("endpoint not found in endpoint cache")] - UnknownEndpoint, } // This allows more useful interactions than `#[from]`. 
@@ -119,8 +115,6 @@ impl UserFacingError for GetAuthInfoError { Self::BadSecret => REQUEST_FAILED.to_owned(), // However, API might return a meaningful error. Self::ApiError(e) => e.to_string_client(), - // pretend like control plane returned an error. - Self::UnknownEndpoint => REQUEST_FAILED.to_owned(), } } } @@ -130,8 +124,6 @@ impl ReportableError for GetAuthInfoError { match self { Self::BadSecret => crate::error::ErrorKind::ControlPlane, Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, - // we only apply endpoint filtering if control plane is under high load. - Self::UnknownEndpoint => crate::error::ErrorKind::ServiceRateLimit, } } } @@ -200,9 +192,6 @@ impl CouldRetry for WakeComputeError { #[derive(Debug, Error)] pub enum GetEndpointJwksError { - #[error("endpoint not found")] - EndpointNotFound, - #[error("failed to build control plane request: {0}")] RequestBuild(#[source] reqwest::Error), diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a58b55a704..e608300bd2 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -52,7 +52,7 @@ pub async fn init() -> anyhow::Result { StderrWriter { stderr: std::io::stderr(), }, - &["request_id", "session_id", "conn_id"], + &["conn_id", "ep", "query_id", "request_id", "session_id"], )) } else { None @@ -271,18 +271,18 @@ where }); // In case logging fails we generate a simpler JSON object. - if let Err(err) = res { - if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { + if let Err(err) = res + && let Ok(mut line) = serde_json::to_vec(&serde_json::json!( { "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true), "level": "ERROR", "message": format_args!("cannot log event: {err:?}"), "fields": { "event": format_args!("{event:?}"), }, - })) { - line.push(b'\n'); - self.writer.make_writer().write_all(&line).ok(); - } + })) + { + line.push(b'\n'); + self.writer.make_writer().write_all(&line).ok(); } } @@ -583,10 +583,11 @@ impl EventFormatter { THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?; // TODO: tls cache? name could change - if let Some(thread_name) = std::thread::current().name() { - if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" { - serializer.serialize_entry("thread_name", thread_name)?; - } + if let Some(thread_name) = std::thread::current().name() + && !thread_name.is_empty() + && thread_name != "tokio-runtime-worker" + { + serializer.serialize_entry("thread_name", thread_name)?; } if let Some(task_id) = tokio::task::try_id() { @@ -596,10 +597,10 @@ impl EventFormatter { serializer.serialize_entry("target", meta.target())?; // Skip adding module if it's the same as target. 
- if let Some(module) = meta.module_path() { - if module != meta.target() { - serializer.serialize_entry("module", module)?; - } + if let Some(module) = meta.module_path() + && module != meta.target() + { + serializer.serialize_entry("module", module)?; } if let Some(file) = meta.file() { diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4c340edfd5..9d1a3d4358 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -10,7 +10,7 @@ use measured::{ Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; -use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec}; use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; @@ -36,7 +36,6 @@ impl Metrics { metrics.proxy.redis_errors_total.init_all_dense(); metrics.proxy.redis_events_count.init_all_dense(); metrics.proxy.retries_metric.init_all_dense(); - metrics.proxy.invalid_endpoints_total.init_all_dense(); metrics.proxy.connection_failures_total.init_all_dense(); SELF.set(metrics) @@ -80,11 +79,6 @@ pub struct ProxyMetrics { )] pub console_request_latency: HistogramVec, - /// Time it takes to acquire a token to call console plane. - // largest bucket = 3^16 * 0.05ms = 2.15s - #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] - pub control_plane_token_acquire_seconds: Histogram<16>, - /// Size of the HTTP request body lengths. // smallest bucket = 16 bytes // largest bucket = 4^12 * 16 bytes = 256MB @@ -98,19 +92,10 @@ pub struct ProxyMetrics { /// Number of opened connections to a database. pub http_pool_opened_connections: Gauge, - /// Number of cache hits/misses for allowed ips. - pub allowed_ips_cache_misses: CounterVec>, - /// Number of allowed ips #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, - /// Number of cache hits/misses for VPC endpoint IDs. - pub vpc_endpoint_id_cache_stats: CounterVec>, - - /// Number of cache hits/misses for access blocker flags. - pub access_blocker_flags_cache_stats: CounterVec>, - /// Number of allowed VPC endpoints IDs #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, @@ -139,21 +124,12 @@ pub struct ProxyMetrics { /// Number of TLS handshake failures pub tls_handshake_failures: Counter, - /// Number of connection requests affected by authentication rate limits - pub requests_auth_rate_limits_total: Counter, - /// HLL approximate cardinality of endpoints that are connecting pub connecting_endpoints: HyperLogLogVec, 32>, /// Number of endpoints affected by errors of a given classification pub endpoints_affected_by_errors: HyperLogLogVec, 32>, - /// Number of endpoints affected by authentication rate limits - pub endpoints_auth_rate_limits: HyperLogLog<32>, - - /// Number of invalid endpoints (per protocol, per rejected). - pub invalid_endpoints_total: CounterVec, - /// Number of retries (per outcome, per retry_type). 
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, @@ -236,13 +212,6 @@ pub enum Bool { False, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "outcome")] -pub enum Outcome { - Success, - Failed, -} - #[derive(FixedCardinalityLabel, Copy, Clone)] #[label(singleton = "outcome")] pub enum CacheOutcome { diff --git a/proxy/src/pglb/copy_bidirectional.rs b/proxy/src/pglb/copy_bidirectional.rs index 97f8d7c6af..5e4262a323 100644 --- a/proxy/src/pglb/copy_bidirectional.rs +++ b/proxy/src/pglb/copy_bidirectional.rs @@ -90,27 +90,27 @@ where // TODO: 1 info log, with a enum label for close direction. // Early termination checks from compute to client. - if let TransferState::Done(_) = compute_to_client { - if let TransferState::Running(buf) = &client_to_compute { - info!("Compute is done, terminate client"); - // Initiate shutdown - client_to_compute = TransferState::ShuttingDown(buf.amt); - client_to_compute_result = - transfer_one_direction(cx, &mut client_to_compute, client, compute) - .map_err(ErrorSource::from_client)?; - } + if let TransferState::Done(_) = compute_to_client + && let TransferState::Running(buf) = &client_to_compute + { + info!("Compute is done, terminate client"); + // Initiate shutdown + client_to_compute = TransferState::ShuttingDown(buf.amt); + client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute) + .map_err(ErrorSource::from_client)?; } // Early termination checks from client to compute. - if let TransferState::Done(_) = client_to_compute { - if let TransferState::Running(buf) = &compute_to_client { - info!("Client is done, terminate compute"); - // Initiate shutdown - compute_to_client = TransferState::ShuttingDown(buf.amt); - compute_to_client_result = - transfer_one_direction(cx, &mut compute_to_client, compute, client) - .map_err(ErrorSource::from_compute)?; - } + if let TransferState::Done(_) = client_to_compute + && let TransferState::Running(buf) = &compute_to_client + { + info!("Client is done, terminate compute"); + // Initiate shutdown + compute_to_client = TransferState::ShuttingDown(buf.amt); + compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, compute, client) + .map_err(ErrorSource::from_compute)?; } // It is not a problem if ready! returns early ... (comment remains the same) diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index f7e54ebfe7..12b4bda0c0 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -39,7 +39,11 @@ impl LeakyBucketRateLimiter { let config = config.map_or(self.default_config, Into::into); - if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + if self + .access_count + .fetch_add(1, Ordering::AcqRel) + .is_multiple_of(2048) + { self.do_gc(now); } diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 2e40f5bf60..fd1b2af023 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -16,44 +16,6 @@ use super::LeakyBucketConfig; use crate::ext::LockExt; use crate::intern::EndpointIdInt; -pub struct GlobalRateLimiter { - data: Vec, - info: Vec, -} - -impl GlobalRateLimiter { - pub fn new(info: Vec) -> Self { - Self { - data: vec![ - RateBucket { - start: Instant::now(), - count: 0, - }; - info.len() - ], - info, - } - } - - /// Check that number of connections is below `max_rps` rps. 
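Both rate-limiter hunks switch the GC trigger from `% 2048 == 0` to `is_multiple_of(2048)` (available on recent stable Rust) without changing behaviour: cleanup is amortised over every 2048th access rather than run on a timer. Roughly, with the surrounding structure simplified:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

struct Limiter {
    access_count: AtomicUsize,
}

impl Limiter {
    fn check(&self) {
        // Every 2048th caller pays for the garbage collection; everyone else
        // only does an atomic increment. Equivalent to `count % 2048 == 0`.
        if self
            .access_count
            .fetch_add(1, Ordering::AcqRel)
            .is_multiple_of(2048)
        {
            self.do_gc();
        }
    }

    fn do_gc(&self) {
        // Drop expired buckets here.
    }
}

fn main() {
    let limiter = Limiter {
        access_count: AtomicUsize::new(0),
    };
    for _ in 0..4096 {
        limiter.check();
    }
}
```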
- pub fn check(&mut self) -> bool { - let now = Instant::now(); - - let should_allow_request = self - .data - .iter_mut() - .zip(&self.info) - .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); - - if should_allow_request { - // only increment the bucket counts if the request will actually be accepted - self.data.iter_mut().for_each(|b| b.inc(1)); - } - - should_allow_request - } -} - // Simple per-endpoint rate limiter. // // Check that number of connections to the endpoint is below `max_rps` rps. @@ -211,7 +173,11 @@ impl BucketRateLimiter { // worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) // = 30MB - if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 { + if self + .access_count + .fetch_add(1, Ordering::AcqRel) + .is_multiple_of(2048) + { self.do_gc(); } diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 112b95873a..828bb63aac 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -8,4 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd; pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; -pub use limiter::{GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; +pub use limiter::{RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs deleted file mode 100644 index 6f56aeea06..0000000000 --- a/proxy/src/redis/cancellation_publisher.rs +++ /dev/null @@ -1,79 +0,0 @@ -use core::net::IpAddr; -use std::sync::Arc; - -use tokio::sync::Mutex; -use uuid::Uuid; - -use crate::pqproto::CancelKeyData; - -pub trait CancellationPublisherMut: Send + Sync + 'static { - #[allow(async_fn_in_trait)] - async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()>; -} - -pub trait CancellationPublisher: Send + Sync + 'static { - #[allow(async_fn_in_trait)] - async fn try_publish( - &self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()>; -} - -impl CancellationPublisher for () { - async fn try_publish( - &self, - _cancel_key_data: CancelKeyData, - _session_id: Uuid, - _peer_addr: IpAddr, - ) -> anyhow::Result<()> { - Ok(()) - } -} - -impl CancellationPublisherMut for P { - async fn try_publish( - &mut self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { -

::try_publish(self, cancel_key_data, session_id, peer_addr) - .await - } -} - -impl CancellationPublisher for Option

{ - async fn try_publish( - &self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - if let Some(p) = self { - p.try_publish(cancel_key_data, session_id, peer_addr).await - } else { - Ok(()) - } - } -} - -impl CancellationPublisher for Arc> { - async fn try_publish( - &self, - cancel_key_data: CancelKeyData, - session_id: Uuid, - peer_addr: IpAddr, - ) -> anyhow::Result<()> { - self.lock() - .await - .try_publish(cancel_key_data, session_id, peer_addr) - .await - } -} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index fe656557ac..35a3fe4334 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,11 +1,12 @@ use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; use futures::FutureExt; use redis::aio::{ConnectionLike, MultiplexedConnection}; use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; -use tokio::task::JoinHandle; -use tracing::{debug, error, info, warn}; +use tokio::task::AbortHandle; +use tracing::{error, info, warn}; use super::elasticache::CredentialsProvider; @@ -31,8 +32,9 @@ pub struct ConnectionWithCredentialsProvider { credentials: Credentials, // TODO: with more load on the connection, we should consider using a connection pool con: Option, - refresh_token_task: Option>, + refresh_token_task: Option, mutex: tokio::sync::Mutex<()>, + credentials_refreshed: Arc, } impl Clone for ConnectionWithCredentialsProvider { @@ -42,6 +44,7 @@ impl Clone for ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(false)), } } } @@ -65,6 +68,7 @@ impl ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(false)), } } @@ -78,6 +82,7 @@ impl ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(true)), } } @@ -85,6 +90,10 @@ impl ConnectionWithCredentialsProvider { redis::cmd("PING").query_async(con).await } + pub(crate) fn credentials_refreshed(&self) -> bool { + self.credentials_refreshed.load(Ordering::Relaxed) + } + pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { @@ -112,13 +121,13 @@ impl ConnectionWithCredentialsProvider { if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); - let f = tokio::spawn(async move { - Self::keep_connection(con2, credentials_provider) - .await - .inspect_err(|e| debug!("keep_connection failed: {e}")) - .ok(); - }); - self.refresh_token_task = Some(f); + let credentials_refreshed = self.credentials_refreshed.clone(); + let f = tokio::spawn(Self::keep_connection( + con2, + credentials_provider, + credentials_refreshed, + )); + self.refresh_token_task = Some(f.abort_handle()); } match Self::ping(&mut con).await { Ok(()) => { @@ -153,6 +162,7 @@ impl ConnectionWithCredentialsProvider { async fn get_client(&self) -> anyhow::Result { let client = redis::Client::open(self.get_connection_info().await?)?; + self.credentials_refreshed.store(true, Ordering::Relaxed); Ok(client) } @@ 
-168,16 +178,19 @@ impl ConnectionWithCredentialsProvider { async fn keep_connection( mut con: MultiplexedConnection, credentials_provider: Arc, - ) -> anyhow::Result<()> { + credentials_refreshed: Arc, + ) -> ! { loop { // The connection lives for 12h, for the sanity check we refresh it every hour. tokio::time::sleep(Duration::from_secs(60 * 60)).await; match Self::refresh_token(&mut con, credentials_provider.clone()).await { Ok(()) => { info!("Token refreshed"); + credentials_refreshed.store(true, Ordering::Relaxed); } Err(e) => { error!("Error during token refresh: {e:?}"); + credentials_refreshed.store(false, Ordering::Relaxed); } } } @@ -231,7 +244,7 @@ impl ConnectionLike for ConnectionWithCredentialsProvider { &'a mut self, cmd: &'a redis::Cmd, ) -> redis::RedisFuture<'a, redis::Value> { - (async move { self.send_packed_command(cmd).await }).boxed() + self.send_packed_command(cmd).boxed() } fn req_packed_commands<'a>( @@ -240,10 +253,10 @@ impl ConnectionLike for ConnectionWithCredentialsProvider { offset: usize, count: usize, ) -> redis::RedisFuture<'a, Vec> { - (async move { self.send_packed_commands(cmd, offset, count).await }).boxed() + self.send_packed_commands(cmd, offset, count).boxed() } fn get_db(&self) -> i64 { - 0 + self.con.as_ref().map_or(0, |c| c.get_db()) } } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index 671fe09b0b..cfdbc21839 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -40,6 +40,10 @@ impl RedisKVClient { .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) } + pub(crate) fn credentials_refreshed(&self) -> bool { + self.client.credentials_refreshed() + } + pub(crate) async fn query( &mut self, q: &impl Queryable, @@ -49,7 +53,7 @@ impl RedisKVClient { Err(e) => e, }; - tracing::error!("failed to run query: {e}"); + tracing::debug!("failed to run query: {e}"); match e.retry_method() { redis::RetryMethod::Reconnect => { tracing::info!("Redis client is disconnected. 
Reconnecting..."); diff --git a/proxy/src/redis/mod.rs b/proxy/src/redis/mod.rs index 8b46a8e6ca..4f5e24ab5f 100644 --- a/proxy/src/redis/mod.rs +++ b/proxy/src/redis/mod.rs @@ -1,4 +1,3 @@ -pub mod cancellation_publisher; pub mod connection_with_credentials_provider; pub mod elasticache; pub mod keys; diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs index e548cf3a83..fcc262f415 100644 --- a/proxy/src/sasl/channel_binding.rs +++ b/proxy/src/sasl/channel_binding.rs @@ -54,9 +54,7 @@ impl ChannelBinding { "eSws".into() } Self::Required(mode) => { - use std::io::Write; - let mut cbind_input = vec![]; - write!(&mut cbind_input, "p={mode},,",).unwrap(); + let mut cbind_input = format!("p={mode},,",).into_bytes(); cbind_input.extend_from_slice(get_cbind_data(mode)?); BASE64_STANDARD.encode(&cbind_input).into() } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 3ba8a79368..a0918fca9f 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -107,7 +107,7 @@ pub(crate) async fn exchange( secret: &ServerSecret, password: &[u8], ) -> sasl::Result> { - let salt = BASE64_STANDARD.decode(&secret.salt_base64)?; + let salt = BASE64_STANDARD.decode(&*secret.salt_base64)?; let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; if secret.is_password_invalid(&client_key).into() { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 42039f099c..c0073917a1 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -87,13 +87,20 @@ impl<'a> ClientFirstMessage<'a> { salt_base64: &str, iterations: u32, ) -> OwnedServerFirstMessage { - use std::fmt::Write; + let mut message = String::with_capacity(128); + message.push_str("r="); - let mut message = String::new(); - write!(&mut message, "r={}", self.nonce).unwrap(); + // write combined nonce + let combined_nonce_start = message.len(); + message.push_str(self.nonce); BASE64_STANDARD.encode_string(nonce, &mut message); - let combined_nonce = 2..message.len(); - write!(&mut message, ",s={salt_base64},i={iterations}").unwrap(); + let combined_nonce = combined_nonce_start..message.len(); + + // write salt and iterations + message.push_str(",s="); + message.push_str(salt_base64); + message.push_str(",i="); + message.push_str(itoa::Buffer::new().format(iterations)); // This design guarantees that it's impossible to create a // server-first-message without receiving a client-first-message diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index f03617f34d..0e070c2f27 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -14,7 +14,7 @@ pub(crate) struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub(crate) iterations: u32, /// Salt used to hash user's password. - pub(crate) salt_base64: String, + pub(crate) salt_base64: Box, /// Hashed `ClientKey`. pub(crate) stored_key: ScramKey, /// Used by client to verify server's signature. @@ -35,7 +35,7 @@ impl ServerSecret { let secret = ServerSecret { iterations: iterations.parse().ok()?, - salt_base64: salt.to_owned(), + salt_base64: salt.into(), stored_key: base64_decode_array(stored_key)?.into(), server_key: base64_decode_array(server_key)?.into(), doomed: false, @@ -58,7 +58,7 @@ impl ServerSecret { // iteration count 1 for our generated passwords going forward. // PG16 users can set iteration count=1 already today. 
iterations: 1, - salt_base64: BASE64_STANDARD.encode(nonce), + salt_base64: BASE64_STANDARD.encode(nonce).into_boxed_str(), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, @@ -88,7 +88,7 @@ mod tests { let parsed = ServerSecret::parse(&secret).unwrap(); assert_eq!(parsed.iterations, iterations); - assert_eq!(parsed.salt_base64, salt); + assert_eq!(&*parsed.salt_base64, salt); assert_eq!(BASE64_STANDARD.encode(parsed.stored_key), stored_key); assert_eq!(BASE64_STANDARD.encode(parsed.server_key), server_key); diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 8f1684c75b..1aa402227f 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -137,7 +137,7 @@ impl Future for JobSpec { let state = state.as_mut().expect("should be set on thread startup"); state.tick = state.tick.wrapping_add(1); - if state.tick % SKETCH_RESET_INTERVAL == 0 { + if state.tick.is_multiple_of(SKETCH_RESET_INTERVAL) { state.countmin.reset(); } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 7708342ae3..daa6429039 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -115,7 +115,8 @@ impl PoolingBackend { match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { - self.config + let keys = self + .config .authentication_config .jwks_cache .check_jwt( @@ -129,7 +130,7 @@ impl PoolingBackend { Ok(ComputeCredentials { info: user_info.clone(), - keys: crate::auth::backend::ComputeCredentialKeys::None, + keys, }) } crate::auth::Backend::Local(_) => { @@ -256,6 +257,7 @@ impl PoolingBackend { &self, ctx: &RequestContext, conn_info: ConnInfo, + disable_pg_session_jwt: bool, ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? 
{ return Ok(client); @@ -277,7 +279,7 @@ impl PoolingBackend { .expect("semaphore should never be closed"); // check again for race - if !self.local_pool.initialized(&conn_info) { + if !self.local_pool.initialized(&conn_info) && !disable_pg_session_jwt { local_backend .compute_ctl .install_extension(&ExtensionInstallRequest { @@ -313,14 +315,16 @@ impl PoolingBackend { .to_postgres_client_config(); config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname) - .set_param( + .dbname(&conn_info.dbname); + if !disable_pg_session_jwt { + config.set_param( "options", &format!( "-c pg_session_jwt.jwk={}", serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") ), ); + } let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(&postgres_client::NoTls).await?; @@ -345,7 +349,9 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.batch_execute("select auth.init();").await { + if !disable_pg_session_jwt + && let Err(e) = client.batch_execute("select auth.init();").await + { discard.discard(); return Err(e.into()); } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index dd8cf052c5..672e59f81f 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -148,11 +148,10 @@ pub(crate) fn poll_client( } // remove from connection pool - if let Some(pool) = pool.clone().upgrade() { - if pool.write().remove_client(db_user.clone(), conn_id) { + if let Some(pool) = pool.clone().upgrade() + && pool.write().remove_client(db_user.clone(), conn_id) { info!("closed connection removed"); } - } Poll::Ready(()) }).await; diff --git a/proxy/src/serverless/error.rs b/proxy/src/serverless/error.rs index 323c91baa5..786964e764 100644 --- a/proxy/src/serverless/error.rs +++ b/proxy/src/serverless/error.rs @@ -1,5 +1,93 @@ use http::StatusCode; +use http::header::HeaderName; + +use crate::auth::ComputeUserInfoParseError; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::http::ReadBodyError; pub trait HttpCodeError { fn get_http_status_code(&self) -> StatusCode; } + +#[derive(Debug, thiserror::Error)] +pub(crate) enum ConnInfoError { + #[error("invalid header: {0}")] + InvalidHeader(&'static HeaderName), + #[error("invalid connection string: {0}")] + UrlParseError(#[from] url::ParseError), + #[error("incorrect scheme")] + IncorrectScheme, + #[error("missing database name")] + MissingDbName, + #[error("invalid database name")] + InvalidDbName, + #[error("missing username")] + MissingUsername, + #[error("invalid username: {0}")] + InvalidUsername(#[from] std::string::FromUtf8Error), + #[error("missing authentication credentials: {0}")] + MissingCredentials(Credentials), + #[error("missing hostname")] + MissingHostname, + #[error("invalid hostname: {0}")] + InvalidEndpoint(#[from] ComputeUserInfoParseError), +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Credentials { + #[error("required password")] + Password, + #[error("required authorization bearer token in JWT format")] + BearerJwt, +} + +impl ReportableError for ConnInfoError { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::User + } +} + +impl UserFacingError for ConnInfoError { + fn to_string_client(&self) -> String { + self.to_string() + } +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum ReadPayloadError { + #[error("could not read the HTTP request body: {0}")] + Read(#[from] hyper::Error), + 
#[error("request is too large (max is {limit} bytes)")] + BodyTooLarge { limit: usize }, + #[error("could not parse the HTTP request body: {0}")] + Parse(#[from] serde_json::Error), +} + +impl From> for ReadPayloadError { + fn from(value: ReadBodyError) -> Self { + match value { + ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, + ReadBodyError::Read(e) => Self::Read(e), + } + } +} + +impl ReportableError for ReadPayloadError { + fn get_error_kind(&self) -> ErrorKind { + match self { + ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User, + ReadPayloadError::Parse(_) => ErrorKind::User, + } + } +} + +impl HttpCodeError for ReadPayloadError { + fn get_http_status_code(&self) -> StatusCode { + match self { + ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, + ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE, + ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 1c6574e57e..7acd816026 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -2,6 +2,8 @@ use std::collections::VecDeque; use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; +use bytes::Bytes; +use http_body_util::combinators::BoxBody; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; @@ -21,8 +23,9 @@ use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -pub(crate) type Send = http2::SendRequest; -pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; +pub(crate) type Send = http2::SendRequest>; +pub(crate) type Connect = + http2::Connection, BoxBody, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); @@ -237,10 +240,10 @@ pub(crate) fn poll_http2_client( } // remove from connection pool - if let Some(pool) = pool.clone().upgrade() { - if pool.write().remove_conn(conn_id) { - info!("closed connection removed"); - } + if let Some(pool) = pool.clone().upgrade() + && pool.write().remove_conn(conn_id) + { + info!("closed connection removed"); } } .instrument(span), diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 95a28663a5..0c91ac6835 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -3,11 +3,42 @@ use anyhow::Context; use bytes::Bytes; -use http::{Response, StatusCode}; +use http::header::AUTHORIZATION; +use http::{HeaderMap, HeaderName, HeaderValue, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use serde::Serialize; +use url::Url; +use uuid::Uuid; + +use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool_lib::ConnInfo; +use super::error::{ConnInfoError, Credentials}; +use crate::auth::backend::ComputeUserInfo; +use crate::config::AuthenticationConfig; +use crate::context::RequestContext; +use crate::metrics::{Metrics, SniGroup, SniKind}; +use crate::pqproto::StartupMessageParams; +use crate::proxy::NeonOptions; +use crate::types::{DbName, EndpointId, RoleName}; + +// Common header names used across serverless modules +pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id"); +pub(super) static CONN_STRING: HeaderName = 
HeaderName::from_static("neon-connection-string"); +pub(super) static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); +pub(super) static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +pub(super) static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); +pub(super) static TXN_ISOLATION_LEVEL: HeaderName = + HeaderName::from_static("neon-batch-isolation-level"); +pub(super) static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); +pub(super) static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); + +pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { + let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH]; + HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..])) + .expect("uuid hyphenated format should be all valid header characters") +} /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> Response> { @@ -107,3 +138,136 @@ pub(crate) fn json_response( .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } + +pub(crate) fn get_conn_info( + config: &'static AuthenticationConfig, + ctx: &RequestContext, + connection_string: Option<&str>, + headers: &HeaderMap, +) -> Result { + let connection_url = match connection_string { + Some(connection_string) => Url::parse(connection_string)?, + None => { + let connection_string = headers + .get(&CONN_STRING) + .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; + Url::parse(connection_string)? + } + }; + + let protocol = connection_url.scheme(); + if protocol != "postgres" && protocol != "postgresql" { + return Err(ConnInfoError::IncorrectScheme); + } + + let mut url_path = connection_url + .path_segments() + .ok_or(ConnInfoError::MissingDbName)?; + + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); + ctx.set_dbname(dbname.clone()); + + let username = RoleName::from(urlencoding::decode(connection_url.username())?); + if username.is_empty() { + return Err(ConnInfoError::MissingUsername); + } + ctx.set_user(username.clone()); + // TODO: make sure this is right in the context of rest broker + let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + if !config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); + } + + let auth = auth + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; + AuthData::Jwt( + auth.strip_prefix("Bearer ") + .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? 
+ .into(), + ) + } else if let Some(pass) = connection_url.password() { + // wrong credentials provided + if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); + } + + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }) + } else if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); + } else { + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); + }; + let endpoint: EndpointId = match connection_url.host() { + Some(url::Host::Domain(hostname)) => hostname + .split_once('.') + .map_or(hostname, |(prefix, _)| prefix) + .into(), + Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { + return Err(ConnInfoError::MissingHostname); + } + }; + ctx.set_endpoint_id(endpoint.clone()); + + let pairs = connection_url.query_pairs(); + + let mut options = Option::None; + + let mut params = StartupMessageParams::default(); + params.insert("user", &username); + params.insert("database", &dbname); + for (key, value) in pairs { + params.insert(&key, &value); + if key == "options" { + options = Some(NeonOptions::parse_options_raw(&value)); + } + } + + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + + ctx.set_user_agent( + headers + .get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + + let user_info = ComputeUserInfo { + endpoint, + user: username, + options: options.unwrap_or_default(), + }; + + let conn_info = ConnInfo { user_info, dbname }; + Ok(ConnInfoWithAuth { conn_info, auth }) +} diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 1afc10359f..2e67d07079 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -70,6 +70,34 @@ pub(crate) enum JsonConversionError { ParseJsonError(#[from] serde_json::Error), #[error("unbalanced array")] UnbalancedArray, + #[error("unbalanced quoted string")] + UnbalancedString, +} + +enum OutputMode { + Array(Vec), + Object(Map), +} + +impl OutputMode { + fn key(&mut self, key: &str) -> &mut Value { + match self { + OutputMode::Array(values) => push_entry(values, Value::Null), + OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null), + } + } + + fn finish(self) -> Value { + match self { + OutputMode::Array(values) => Value::Array(values), + OutputMode::Object(map) => Value::Object(map), + } + } +} + +fn push_entry(arr: &mut Vec, t: T) -> &mut T { + arr.push(t); + arr.last_mut().expect("a value was just inserted") } // @@ -77,182 +105,277 @@ pub(crate) enum JsonConversionError { // pub(crate) fn pg_text_row_to_json( row: &Row, - columns: &[Type], raw_output: bool, array_mode: bool, ) -> Result { - let iter = row - .columns() - .iter() - .zip(columns) - .enumerate() - .map(|(i, (column, typ))| { - let name = column.name(); - let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; - let json_value = if raw_output { - match 
pg_value { - Some(v) => Value::String(v.to_string()), - None => Value::Null, - } - } else { - pg_text_to_json(pg_value, typ)? - }; - Ok((name.to_string(), json_value)) - }); - - if array_mode { - // drop keys and aggregate into array - let arr = iter - .map(|r| r.map(|(_key, val)| val)) - .collect::, JsonConversionError>>()?; - Ok(Value::Array(arr)) + let mut entries = if array_mode { + OutputMode::Array(Vec::with_capacity(row.columns().len())) } else { - let obj = iter.collect::, JsonConversionError>>()?; - Ok(Value::Object(obj)) + OutputMode::Object(Map::with_capacity(row.columns().len())) + }; + + for (i, column) in row.columns().iter().enumerate() { + let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; + + let value = entries.key(column.name()); + + match pg_value { + Some(v) if raw_output => *value = Value::String(v.to_string()), + Some(v) => pg_text_to_json(value, v, column.type_())?, + None => *value = Value::Null, + } } + + Ok(entries.finish()) } // // Convert postgres text-encoded value to JSON value // -fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { - if let Some(val) = pg_value { - if let Kind::Array(elem_type) = pg_type.kind() { - return pg_array_parse(val, elem_type); - } +fn pg_text_to_json( + output: &mut Value, + val: &str, + pg_type: &Type, +) -> Result<(), JsonConversionError> { + if let Kind::Array(elem_type) = pg_type.kind() { + // todo: we should fetch this from postgres. + let delimiter = ','; - match *pg_type { - Type::BOOL => Ok(Value::Bool(val == "t")), - Type::INT2 | Type::INT4 => { - let val = val.parse::()?; - Ok(Value::Number(serde_json::Number::from(val))) - } - Type::FLOAT4 | Type::FLOAT8 => { - let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - Ok(Value::Number(num)) - } else { - // Pass Nan, Inf, -Inf as strings - // JS JSON.stringify() does converts them to null, but we - // want to preserve them, so we pass them as strings - Ok(Value::String(val.to_string())) - } - } - Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), - _ => Ok(Value::String(val.to_string())), - } - } else { - Ok(Value::Null) - } -} - -// -// Parse postgres array into JSON array. -// -// This is a bit involved because we need to handle nested arrays and quoted -// values. Unlike postgres we don't check that all nested arrays have the same -// dimensions, we just return them as is. -// -fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { - pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v) -} - -fn pg_array_parse_inner( - pg_array: &str, - elem_type: &Type, - nested: bool, -) -> Result<(Value, usize), JsonConversionError> { - let mut pg_array_chr = pg_array.char_indices(); - let mut level = 0; - let mut quote = false; - let mut entries: Vec = Vec::new(); - let mut entry = String::new(); - - // skip bounds decoration - if let Some('[') = pg_array.chars().next() { - for (_, c) in pg_array_chr.by_ref() { - if c == '=' { - break; - } - } + let mut array = vec![]; + pg_array_parse(&mut array, val, elem_type, delimiter)?; + *output = Value::Array(array); + return Ok(()); } - fn push_checked( - entry: &mut String, - entries: &mut Vec, - elem_type: &Type, - ) -> Result<(), JsonConversionError> { - if !entry.is_empty() { - // While in usual postgres response we get nulls as None and everything else - // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while - // string with value 'NULL' will be represented by '"NULL"'). 
So catch NULLs - // here while we have quotation info and convert them to None. - if entry == "NULL" { - entries.push(pg_text_to_json(None, elem_type)?); + match *pg_type { + Type::BOOL => *output = Value::Bool(val == "t"), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + *output = Value::Number(serde_json::Number::from(val)); + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + *output = Value::Number(num); } else { - entries.push(pg_text_to_json(Some(entry), elem_type)?); + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + *output = Value::String(val.to_string()); } - entry.clear(); } - - Ok(()) + Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?, + _ => *output = Value::String(val.to_string()), } - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; + Ok(()) +} - if c == '\\' { - escaped = true; - let Some(x) = pg_array_chr.next() else { - return Err(JsonConversionError::UnbalancedArray); - }; - (i, c) = x; - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?; - entries.push(res); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, &mut entries, elem_type)?; - if nested { - return Ok((Value::Array(entries), i)); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, &mut entries, elem_type)?; - } - _ => { - entry.push(c); - } - } +/// Parse postgres array into JSON array. +/// +/// This is a bit involved because we need to handle nested arrays and quoted +/// values. Unlike postgres we don't check that all nested arrays have the same +/// dimensions, we just return them as is. +/// +/// +/// +/// The external text representation of an array value consists of items that are interpreted +/// according to the I/O conversion rules for the array's element type, plus decoration that +/// indicates the array structure. The decoration consists of curly braces (`{` and `}`) around +/// the array value plus delimiter characters between adjacent items. The delimiter character +/// is usually a comma (,) but can be something else: it is determined by the typdelim setting +/// for the array's element type. Among the standard data types provided in the PostgreSQL +/// distribution, all use a comma, except for type box, which uses a semicolon (;). +/// +/// In a multidimensional array, each dimension (row, plane, cube, etc.) +/// gets its own level of curly braces, and delimiters must be written between adjacent +/// curly-braced entities of the same level. +fn pg_array_parse( + elements: &mut Vec, + mut pg_array: &str, + elem: &Type, + delim: char, +) -> Result<(), JsonConversionError> { + // skip bounds decoration, eg: + // `[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}` + // technically these are significant, but we have no way to represent them in json. 
+ if let Some('[') = pg_array.chars().next() { + let Some((_bounds, array)) = pg_array.split_once('=') else { + return Err(JsonConversionError::UnbalancedArray); + }; + pg_array = array; } - if level != 0 { + // whitespace might preceed a `{`. + let pg_array = pg_array.trim_start(); + + let rest = pg_array_parse_inner(elements, pg_array, elem, delim)?; + if !rest.is_empty() { return Err(JsonConversionError::UnbalancedArray); } - Ok((Value::Array(entries), 0)) + Ok(()) +} + +/// reads a single array from the `pg_array` string and pushes each values to `elements`. +/// returns the rest of the `pg_array` string that was not read. +fn pg_array_parse_inner<'a>( + elements: &mut Vec, + mut pg_array: &'a str, + elem: &Type, + delim: char, +) -> Result<&'a str, JsonConversionError> { + // array should have a `{` prefix. + pg_array = pg_array + .strip_prefix('{') + .ok_or(JsonConversionError::UnbalancedArray)?; + + let mut q = String::new(); + + loop { + let value = push_entry(elements, Value::Null); + pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?; + + // check for separator. + if let Some(next) = pg_array.strip_prefix(delim) { + // next item. + pg_array = next; + } else { + break; + } + } + + let Some(next) = pg_array.strip_prefix('}') else { + // missing `}` terminator. + return Err(JsonConversionError::UnbalancedArray); + }; + + // whitespace might follow a `}`. + Ok(next.trim_start()) +} + +/// reads a single item from the `pg_array` string. +/// returns the rest of the `pg_array` string that was not read. +/// +/// `quoted` is a scratch allocation that has no defined output. +fn pg_array_parse_item<'a>( + output: &mut Value, + quoted: &mut String, + mut pg_array: &'a str, + elem: &Type, + delim: char, +) -> Result<&'a str, JsonConversionError> { + // We are trying to parse an array item. + // This could be a new array, if this is a multi-dimentional array. + // This could be a quoted string representing `elem`. + // This could be an unquoted string representing `elem`. + + // whitespace might preceed an item. + pg_array = pg_array.trim_start(); + + if pg_array.starts_with('{') { + // nested array. + let mut nested = vec![]; + pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?; + *output = Value::Array(nested); + return Ok(pg_array); + } + + if let Some(mut pg_array) = pg_array.strip_prefix('"') { + // the parsed string is un-escaped and written into quoted. + pg_array = pg_array_parse_quoted(quoted, pg_array)?; + + // we have un-escaped the string, parse it as pgtext. + pg_text_to_json(output, quoted, elem)?; + + return Ok(pg_array); + } + + // we need to parse an item. read until we find a delimiter or `}`. + let index = pg_array + .find([delim, '}']) + .ok_or(JsonConversionError::UnbalancedArray)?; + + let item; + (item, pg_array) = pg_array.split_at(index); + + // item might have trailing whitespace that we need to ignore. + let item = item.trim_end(); + + // we might have an item string: + // check for null + if item == "NULL" { + *output = Value::Null; + } else { + pg_text_to_json(output, item, elem)?; + } + + Ok(pg_array) +} + +/// reads a single quoted item from the `pg_array` string. +/// +/// Returns the rest of the `pg_array` string that was not read. +/// The output is written into `quoted`. +/// +/// The pg_array string must have a `"` terminator, but the `"` initial value +/// must have already been removed from the input. The terminator is removed. 
+fn pg_array_parse_quoted<'a>( + quoted: &mut String, + mut pg_array: &'a str, +) -> Result<&'a str, JsonConversionError> { + // The array output routine will put double quotes around element values if they are empty strings, + // contain curly braces, delimiter characters, double quotes, backslashes, or white space, + // or match the word `NULL`. Double quotes and backslashes embedded in element values will be backslash-escaped. + // For numeric data types it is safe to assume that double quotes will never appear, + // but for textual data types one should be prepared to cope with either the presence or absence of quotes. + + quoted.clear(); + + // We write to quoted in chunks terminated by an escape character. + // Eg if we have the input `foo\"bar"`, then we write `foo`, then `"`, then finally `bar`. + + loop { + // we need to parse an chunk. read until we find a '\\' or `"`. + let i = pg_array + .find(['\\', '"']) + .ok_or(JsonConversionError::UnbalancedString)?; + + let chunk: &str; + (chunk, pg_array) = pg_array + .split_at_checked(i) + .expect("i is guaranteed to be in-bounds of pg_array"); + + // push the chunk. + quoted.push_str(chunk); + + // consume the chunk_end character. + let chunk_end: char; + (chunk_end, pg_array) = + split_first_char(pg_array).expect("pg_array should start with either '\\\\' or '\"'"); + + // finished. + if chunk_end == '"' { + // whitespace might follow the '"'. + pg_array = pg_array.trim_start(); + + break Ok(pg_array); + } + + // consume the escaped character. + let escaped: char; + (escaped, pg_array) = + split_first_char(pg_array).ok_or(JsonConversionError::UnbalancedString)?; + + quoted.push(escaped); + } +} + +fn split_first_char(s: &str) -> Option<(char, &str)> { + let mut chars = s.chars(); + let c = chars.next()?; + Some((c, chars.as_str())) } #[cfg(test)] @@ -316,37 +439,33 @@ mod tests { ); } + fn pg_text_to_json(val: &str, pg_type: &Type) -> Value { + let mut v = Value::Null; + super::pg_text_to_json(&mut v, val, pg_type).unwrap(); + v + } + + fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value { + let mut array = vec![]; + super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap(); + Value::Array(array) + } + #[test] fn test_atomic_types_parse() { + assert_eq!(pg_text_to_json("foo", &Type::TEXT), json!("foo")); + assert_eq!(pg_text_to_json("42", &Type::INT4), json!(42)); + assert_eq!(pg_text_to_json("42", &Type::INT2), json!(42)); + assert_eq!(pg_text_to_json("42", &Type::INT8), json!("42")); + assert_eq!(pg_text_to_json("42.42", &Type::FLOAT8), json!(42.42)); + assert_eq!(pg_text_to_json("42.42", &Type::FLOAT4), json!(42.42)); + assert_eq!(pg_text_to_json("NaN", &Type::FLOAT4), json!("NaN")); assert_eq!( - pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), - json!("foo") - ); - assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); - assert_eq!( - pg_text_to_json(Some("42"), &Type::INT8).unwrap(), - json!("42") - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), - json!("NaN") - ); - assert_eq!( - pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + pg_text_to_json("Infinity", &Type::FLOAT4), json!("Infinity") ); assert_eq!( - 
pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + pg_text_to_json("-Infinity", &Type::FLOAT4), json!("-Infinity") ); @@ -355,10 +474,9 @@ mod tests { .unwrap(); assert_eq!( pg_text_to_json( - Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#, &Type::JSONB - ) - .unwrap(), + ), json ); } @@ -366,7 +484,7 @@ mod tests { #[test] fn test_pg_array_parse_text() { fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::TEXT).unwrap() + pg_array_parse(pg_arr, &Type::TEXT) } assert_eq!( pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), @@ -389,7 +507,7 @@ mod tests { #[test] fn test_pg_array_parse_bool() { fn pb(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::BOOL).unwrap() + pg_array_parse(pg_arr, &Type::BOOL) } assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); @@ -406,7 +524,7 @@ mod tests { #[test] fn test_pg_array_parse_numbers() { fn pn(pg_arr: &str, ty: &Type) -> Value { - pg_array_parse(pg_arr, ty).unwrap() + pg_array_parse(pg_arr, ty) } assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); @@ -434,7 +552,7 @@ mod tests { #[test] fn test_pg_array_with_decoration() { fn p(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::INT2).unwrap() + pg_array_parse(pg_arr, &Type::INT2) } assert_eq!( p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), @@ -445,7 +563,7 @@ mod tests { #[test] fn test_pg_array_parse_json() { fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::JSONB).unwrap() + pg_array_parse(pg_arr, &Type::JSONB) } assert_eq!(pt(r#"{"{}"}"#), json!([{}])); assert_eq!( diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c367615fb8..e4cbd02bfe 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -249,11 +249,10 @@ pub(crate) fn poll_client( } // remove from connection pool - if let Some(pool) = pool.clone().upgrade() { - if pool.global_pool.write().remove_client(db_user.clone(), conn_id) { + if let Some(pool) = pool.clone().upgrade() + && pool.global_pool.write().remove_client(db_user.clone(), conn_id) { info!("closed connection removed"); } - } Poll::Ready(()) }).await; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index d8942bb814..5b7289c53d 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -29,13 +29,13 @@ use futures::future::{Either, select}; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; +use http_util::{NEON_REQUEST_ID, uuid_to_header_value}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; use rand::SeedableRng; use rand::rngs::StdRng; -use sql_over_http::{NEON_REQUEST_ID, uuid_to_header_value}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5d5e7bf83e..7a718d0280 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -11,7 +11,7 @@ use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; -use hyper::{HeaderMap, Request, Response, StatusCode, header}; +use hyper::{Request, 
Response, StatusCode, header}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{ @@ -24,26 +24,23 @@ use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{Level, debug, error, info}; use typed_json::json; -use url::Url; -use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; -use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool::AuthData; use super::conn_pool_lib::{self, ConnInfo}; -use super::error::HttpCodeError; -use super::http_util::json_response; +use super::error::{ConnInfoError, HttpCodeError, ReadPayloadError}; +use super::http_util::{ + ALLOW_POOL, ARRAY_MODE, CONN_STRING, NEON_REQUEST_ID, RAW_TEXT_OUTPUT, TXN_DEFERRABLE, + TXN_ISOLATION_LEVEL, TXN_READ_ONLY, get_conn_info, json_response, uuid_to_header_value, +}; use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; -use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::{ComputeUserInfoParseError, endpoint_sni}; -use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::config::{HttpConfig, ProxyConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; -use crate::pqproto::StartupMessageParams; -use crate::proxy::NeonOptions; +use crate::http::read_body_with_limit; +use crate::metrics::{HttpDirection, Metrics}; use crate::serverless::backend::HttpConnError; -use crate::types::{DbName, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::util::run_until_cancelled; @@ -70,16 +67,6 @@ enum Payload { Batch(BatchQueryData), } -pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id"); - -static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); -static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); -static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); -static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); -static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); -static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); -static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); - static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> @@ -91,188 +78,6 @@ where Ok(json_to_pg_text(json)) } -#[derive(Debug, thiserror::Error)] -pub(crate) enum ConnInfoError { - #[error("invalid header: {0}")] - InvalidHeader(&'static HeaderName), - #[error("invalid connection string: {0}")] - UrlParseError(#[from] url::ParseError), - #[error("incorrect scheme")] - IncorrectScheme, - #[error("missing database name")] - MissingDbName, - #[error("invalid database name")] - InvalidDbName, - #[error("missing username")] - MissingUsername, - #[error("invalid username: {0}")] - InvalidUsername(#[from] std::string::FromUtf8Error), - #[error("missing authentication credentials: {0}")] - MissingCredentials(Credentials), - #[error("missing hostname")] - MissingHostname, - #[error("invalid hostname: {0}")] - InvalidEndpoint(#[from] ComputeUserInfoParseError), - #[error("malformed 
endpoint")] - MalformedEndpoint, -} - -#[derive(Debug, thiserror::Error)] -pub(crate) enum Credentials { - #[error("required password")] - Password, - #[error("required authorization bearer token in JWT format")] - BearerJwt, -} - -impl ReportableError for ConnInfoError { - fn get_error_kind(&self) -> ErrorKind { - ErrorKind::User - } -} - -impl UserFacingError for ConnInfoError { - fn to_string_client(&self) -> String { - self.to_string() - } -} - -fn get_conn_info( - config: &'static AuthenticationConfig, - ctx: &RequestContext, - headers: &HeaderMap, - tls: Option<&TlsConfig>, -) -> Result { - let connection_string = headers - .get(&CONN_STRING) - .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? - .to_str() - .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; - - let connection_url = Url::parse(connection_string)?; - - let protocol = connection_url.scheme(); - if protocol != "postgres" && protocol != "postgresql" { - return Err(ConnInfoError::IncorrectScheme); - } - - let mut url_path = connection_url - .path_segments() - .ok_or(ConnInfoError::MissingDbName)?; - - let dbname: DbName = - urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); - ctx.set_dbname(dbname.clone()); - - let username = RoleName::from(urlencoding::decode(connection_url.username())?); - if username.is_empty() { - return Err(ConnInfoError::MissingUsername); - } - ctx.set_user(username.clone()); - - let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { - if !config.accept_jwts { - return Err(ConnInfoError::MissingCredentials(Credentials::Password)); - } - - let auth = auth - .to_str() - .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; - AuthData::Jwt( - auth.strip_prefix("Bearer ") - .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? - .into(), - ) - } else if let Some(pass) = connection_url.password() { - // wrong credentials provided - if config.accept_jwts { - return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); - } - - AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { - std::borrow::Cow::Borrowed(b) => b.into(), - std::borrow::Cow::Owned(b) => b.into(), - }) - } else if config.accept_jwts { - return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); - } else { - return Err(ConnInfoError::MissingCredentials(Credentials::Password)); - }; - - let endpoint = match connection_url.host() { - Some(url::Host::Domain(hostname)) => { - if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? 
- } else { - hostname - .split_once('.') - .map_or(hostname, |(prefix, _)| prefix) - .into() - } - } - Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { - return Err(ConnInfoError::MissingHostname); - } - }; - ctx.set_endpoint_id(endpoint.clone()); - - let pairs = connection_url.query_pairs(); - - let mut options = Option::None; - - let mut params = StartupMessageParams::default(); - params.insert("user", &username); - params.insert("database", &dbname); - for (key, value) in pairs { - params.insert(&key, &value); - if key == "options" { - options = Some(NeonOptions::parse_options_raw(&value)); - } - } - - // check the URL that was used, for metrics - { - let host_endpoint = headers - // get the host header - .get("host") - // extract the domain - .and_then(|h| { - let (host, _port) = h.to_str().ok()?.split_once(':')?; - Some(host) - }) - // get the endpoint prefix - .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); - - let kind = if host_endpoint == Some(&*endpoint) { - SniKind::Sni - } else { - SniKind::NoSni - }; - - let protocol = ctx.protocol(); - Metrics::get() - .proxy - .accepted_connections_by_sni - .inc(SniGroup { protocol, kind }); - } - - ctx.set_user_agent( - headers - .get(hyper::header::USER_AGENT) - .and_then(|h| h.to_str().ok()) - .map(Into::into), - ); - - let user_info = ComputeUserInfo { - endpoint, - user: username, - options: options.unwrap_or_default(), - }; - - let conn_info = ConnInfo { user_info, dbname }; - Ok(ConnInfoWithAuth { conn_info, auth }) -} - pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestContext, @@ -541,45 +346,6 @@ impl HttpCodeError for SqlOverHttpError { } } -#[derive(Debug, thiserror::Error)] -pub(crate) enum ReadPayloadError { - #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper::Error), - #[error("request is too large (max is {limit} bytes)")] - BodyTooLarge { limit: usize }, - #[error("could not parse the HTTP request body: {0}")] - Parse(#[from] serde_json::Error), -} - -impl From> for ReadPayloadError { - fn from(value: ReadBodyError) -> Self { - match value { - ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, - ReadBodyError::Read(e) => Self::Read(e), - } - } -} - -impl ReportableError for ReadPayloadError { - fn get_error_kind(&self) -> ErrorKind { - match self { - ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, - ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User, - ReadPayloadError::Parse(_) => ErrorKind::User, - } - } -} - -impl HttpCodeError for ReadPayloadError { - fn get_http_status_code(&self) -> StatusCode { - match self { - ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, - ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE, - ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, - } - } -} - #[derive(Debug, thiserror::Error)] pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] @@ -670,14 +436,7 @@ async fn handle_inner( "handling interactive connection from client" ); - let conn_info = get_conn_info( - &config.authentication_config, - ctx, - request.headers(), - // todo: race condition? - // we're unlikely to change the common names. 
- config.tls_config.load().as_deref(), - )?; + let conn_info = get_conn_info(&config.authentication_config, ctx, None, request.headers())?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" @@ -763,9 +522,17 @@ async fn handle_db_inner( ComputeCredentialKeys::JwtPayload(payload) if backend.auth_backend.is_local_proxy() => { - let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - let (cli_inner, _dsc) = client.client_inner(); - cli_inner.set_jwt_session(&payload).await?; + #[cfg(feature = "testing")] + let disable_pg_session_jwt = config.disable_pg_session_jwt; + #[cfg(not(feature = "testing"))] + let disable_pg_session_jwt = false; + let mut client = backend + .connect_to_local_postgres(ctx, conn_info, disable_pg_session_jwt) + .await?; + if !disable_pg_session_jwt { + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; + } Client::Local(client) } _ => { @@ -864,12 +631,6 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[ &TXN_DEFERRABLE, ]; -pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { - let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH]; - HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..])) - .expect("uuid hyphenated format should be all valid header characters") -} - async fn handle_auth_broker_inner( ctx: &RequestContext, request: Request, @@ -899,7 +660,7 @@ async fn handle_auth_broker_inner( req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())); let req = req - .body(body) + .body(body.map_err(|e| e).boxed()) //TODO: is there a potential for a regression here? .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress @@ -1135,7 +896,6 @@ async fn query_to_json( let columns_len = row_stream.statement.columns().len(); let mut fields = Vec::with_capacity(columns_len); - let mut types = Vec::with_capacity(columns_len); for c in row_stream.statement.columns() { fields.push(json!({ @@ -1147,8 +907,6 @@ async fn query_to_json( "dataTypeModifier": c.type_modifier(), "format": "text", })); - - types.push(c.type_().clone()); } let raw_output = parsed_headers.raw_output; @@ -1170,7 +928,7 @@ async fn query_to_json( )); } - let row = pg_text_row_to_json(&row, &types, raw_output, array_mode)?; + let row = pg_text_row_to_json(&row, raw_output, array_mode)?; rows.push(row); // assumption: parsing pg text and converting to json takes CPU time. 
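A note on the quoted-item handling in the proxy/src/serverless/json.rs rewrite above: pg_array_parse_quoted copies un-escaped chunks up to the next backslash or double quote instead of pushing one character at a time. Below is a minimal, self-contained sketch of that chunked un-escaping idea; the function name unescape_quoted and the standalone main are illustrative assumptions, not part of this patch.

// Minimal sketch, std only: un-escape a Postgres quoted array item, given the
// text that follows the opening '"'. Everything up to the next '\' or '"' is
// copied in one chunk; the closing quote ends the item and the unread tail is
// returned. This mirrors the idea of pg_array_parse_quoted, not its exact code.
fn unescape_quoted(input: &str) -> Option<(String, &str)> {
    let mut out = String::new();
    let mut rest = input;
    loop {
        // Find the next escape or terminator; a missing '"' means unbalanced input.
        let i = rest.find(['\\', '"'])?;
        out.push_str(&rest[..i]);

        // Consume the found character.
        let mut chars = rest[i..].chars();
        let found = chars.next()?;
        rest = chars.as_str();

        if found == '"' {
            // Closing quote: return the un-escaped value and the unread remainder.
            return Some((out, rest));
        }

        // Backslash: keep the escaped character verbatim.
        let mut chars = rest.chars();
        out.push(chars.next()?);
        rest = chars.as_str();
    }
}

fn main() {
    // `foo\"bar",next` un-escapes to `foo"bar`, leaving `,next` unread.
    assert_eq!(
        unescape_quoted(r#"foo\"bar",next"#),
        Some(("foo\"bar".to_string(), ",next"))
    );
}

The implementation in this diff additionally trims whitespace that follows the closing quote and reuses a single scratch String across items to avoid repeated allocations.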
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index c49a431c95..4e55654515 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -199,27 +199,27 @@ impl PqStream { let probe_msg; let mut msg = &*msg; - if let Some(ctx) = ctx { - if ctx.get_testodrome_id().is_some() { - let tag = match error_kind { - ErrorKind::User => "client", - ErrorKind::ClientDisconnect => "client", - ErrorKind::RateLimit => "proxy", - ErrorKind::ServiceRateLimit => "proxy", - ErrorKind::Quota => "proxy", - ErrorKind::Service => "proxy", - ErrorKind::ControlPlane => "controlplane", - ErrorKind::Postgres => "other", - ErrorKind::Compute => "compute", - }; - probe_msg = typed_json::json!({ - "tag": tag, - "msg": msg, - "cold_start_info": ctx.cold_start_info(), - }) - .to_string(); - msg = &probe_msg; - } + if let Some(ctx) = ctx + && ctx.get_testodrome_id().is_some() + { + let tag = match error_kind { + ErrorKind::User => "client", + ErrorKind::ClientDisconnect => "client", + ErrorKind::RateLimit => "proxy", + ErrorKind::ServiceRateLimit => "proxy", + ErrorKind::Quota => "proxy", + ErrorKind::Service => "proxy", + ErrorKind::ControlPlane => "controlplane", + ErrorKind::Postgres => "other", + ErrorKind::Compute => "compute", + }; + probe_msg = typed_json::json!({ + "tag": tag, + "msg": msg, + "cold_start_info": ctx.cold_start_info(), + }) + .to_string(); + msg = &probe_msg; } // TODO: either preserve the error code from postgres, or assign error codes to proxy errors. diff --git a/proxy/src/types.rs b/proxy/src/types.rs index d5952d1d8b..43b8dc5b29 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -107,13 +107,3 @@ smol_str_wrapper!(DbName); // postgres hostname, will likely be a port:ip addr smol_str_wrapper!(Host); - -// Endpoints are a bit tricky. Rare they might be branches or projects. 
-impl EndpointId { - pub(crate) fn is_endpoint(&self) -> bool { - self.0.starts_with("ep-") - } - pub(crate) fn is_branch(&self) -> bool { - self.0.starts_with("br-") - } -} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8d31ada24f..8fda625817 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -18,9 +18,10 @@ use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, - DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, - DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, + DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES, + DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; use safekeeper::wal_backup::WalBackup; use safekeeper::{ @@ -138,6 +139,15 @@ struct Args { /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)] max_offloader_lag: u64, + /* BEGIN_HADRON */ + /// Safekeeper will re-elect a new offloader if the current backup lagging for more than this value in bytes + #[arg(long, default_value_t = DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES)] + max_reelect_offloader_lag_bytes: u64, + /// Safekeeper will stop accepting new WALs if the timeline disk usage exceeds this value in bytes. + /// Setting this value to 0 disables the limit. + #[arg(long, default_value_t = DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES)] + max_timeline_disk_usage_bytes: u64, + /* END_HADRON */ /// Number of max parallel WAL segments to be offloaded to remote storage. #[arg(long, default_value = "5")] wal_backup_parallel_jobs: usize, @@ -391,6 +401,10 @@ async fn main() -> anyhow::Result<()> { peer_recovery_enabled: args.peer_recovery, remote_storage: args.remote_storage, max_offloader_lag_bytes: args.max_offloader_lag, + /* BEGIN_HADRON */ + max_reelect_offloader_lag_bytes: args.max_reelect_offloader_lag_bytes, + max_timeline_disk_usage_bytes: args.max_timeline_disk_usage_bytes, + /* END_HADRON */ wal_backup_enabled: !args.disable_wal_backup, backup_parallel_jobs: args.wal_backup_parallel_jobs, pg_auth, diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 4fc62fb229..76c2223891 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -17,6 +17,7 @@ use utils::crashsafe::durable_rename; use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file}; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; +use crate::metrics::WAL_DISK_IO_ERRORS; use crate::state::{EvictionState, TimelinePersistentState}; pub const SK_MAGIC: u32 = 0xcafeceefu32; @@ -192,11 +193,14 @@ impl TimelinePersistentState { impl Storage for FileStorage { /// Persists state durably to the underlying storage. 
async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { + // start timer for metrics let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); - // write data to safekeeper.control.partial let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); let mut control_partial = File::create(&control_partial_path).await.with_context(|| { + /* BEGIN_HADRON */ + WAL_DISK_IO_ERRORS.inc(); + /*END_HADRON */ format!( "failed to create partial control file at: {}", &control_partial_path @@ -206,14 +210,24 @@ impl Storage for FileStorage { let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { + /* BEGIN_HADRON */ + WAL_DISK_IO_ERRORS.inc(); + /*END_HADRON */ format!("failed to write safekeeper state into control file at: {control_partial_path}") })?; control_partial.flush().await.with_context(|| { + /* BEGIN_HADRON */ + WAL_DISK_IO_ERRORS.inc(); + /*END_HADRON */ format!("failed to flush safekeeper state into control file at: {control_partial_path}") })?; let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - durable_rename(&control_partial_path, &control_path, !self.no_sync).await?; + durable_rename(&control_partial_path, &control_path, !self.no_sync) + .await + /* BEGIN_HADRON */ + .inspect_err(|_| WAL_DISK_IO_ERRORS.inc())?; + /* END_HADRON */ // update internal state self.state = s.clone(); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b4d9cadd6d..c461c071da 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -61,6 +61,13 @@ pub mod defaults { pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); + /* BEGIN_HADRON */ + // Default leader re-elect is 0(disabled). SK will re-elect leader if the current leader is lagging this many bytes. + pub const DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES: u64 = 0; + // Default disk usage limit is 0 (disabled). It means each timeline by default can use up to this many WAL + // disk space on this SK until SK begins to reject WALs. 
+ pub const DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES: u64 = 0; + /* END_HADRON */ pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; @@ -99,6 +106,10 @@ pub struct SafeKeeperConf { pub peer_recovery_enabled: bool, pub remote_storage: Option, pub max_offloader_lag_bytes: u64, + /* BEGIN_HADRON */ + pub max_reelect_offloader_lag_bytes: u64, + pub max_timeline_disk_usage_bytes: u64, + /* END_HADRON */ pub backup_parallel_jobs: usize, pub wal_backup_enabled: bool, pub pg_auth: Option>, @@ -151,6 +162,10 @@ impl SafeKeeperConf { sk_auth_token: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, + /* BEGIN_HADRON */ + max_reelect_offloader_lag_bytes: defaults::DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, + max_timeline_disk_usage_bytes: defaults::DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES, + /* END_HADRON */ current_thread_runtime: false, walsenders_keep_horizon: false, partial_backup_timeout: Duration::from_secs(0), diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index cb21a5f6d2..9baa80f73a 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -58,6 +58,25 @@ pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); +/* BEGIN_HADRON */ +pub static WAL_DISK_IO_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_wal_disk_io_errors", + "Number of disk I/O errors when creating and flushing WALs and control files" + ) + .expect("Failed to register safekeeper_wal_disk_io_errors counter") +}); +pub static WAL_STORAGE_LIMIT_ERRORS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_wal_storage_limit_errors", + concat!( + "Number of errors due to timeline WAL storage utilization exceeding configured limit. ", + "An increase in this metric indicates issues backing up or removing WALs." 
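Both new knobs follow the convention stated in the comments above: a value of 0 disables the behaviour (no offloader re-election, no per-timeline disk cap). A tiny illustration of that guard, mirroring the `if max_timeline_disk_usage_bytes > 0` check added to `timeline.rs` later in this diff:

```rust
// Minimal sketch of the "0 disables the limit" convention shared by both knobs.
fn exceeds_limit(usage_bytes: u64, limit_bytes: u64) -> bool {
    limit_bytes != 0 && usage_bytes > limit_bytes
}

fn main() {
    assert!(!exceeds_limit(10 << 30, 0)); // limit disabled: never trips
    assert!(!exceeds_limit(1 << 30, 2 << 30)); // under the limit
    assert!(exceeds_limit(3 << 30, 2 << 30)); // over the limit: reject new WAL
}
```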
+ ) + ) + .expect("Failed to register safekeeper_wal_storage_limit_errors counter") +}); +/* END_HADRON */ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_persist_control_file_seconds", @@ -138,6 +157,15 @@ pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_backup_errors_total counter") }); +/* BEGIN_HADRON */ +pub static BACKUP_REELECT_LEADER_COUNT: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_backup_reelect_leader_total", + "Number of times the backup leader was reelected" + ) + .expect("Failed to register safekeeper_backup_reelect_leader_total counter") +}); +/* END_HADRON */ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_broker_push_update_seconds", diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 2192f5eab4..72a436e25f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -16,7 +16,7 @@ use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; use tracing::{Instrument, error, info, info_span}; -use utils::critical; +use utils::critical_timeline; use utils::lsn::Lsn; use utils::postgres_client::{Compression, InterpretedFormat}; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; @@ -268,6 +268,8 @@ impl InterpretedWalReader { let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + let ttid = wal_stream.ttid; + let reader = InterpretedWalReader { wal_stream, shard_senders: HashMap::from([( @@ -300,7 +302,11 @@ impl InterpretedWalReader { .inspect_err(|err| match err { // TODO: we may want to differentiate these errors further. InterpretedWalReaderError::Decode(_) => { - critical!("failed to decode WAL record: {err:?}"); + critical_timeline!( + ttid.tenant_id, + ttid.timeline_id, + "failed to read WAL record: {err:?}" + ); } err => error!("failed to read WAL record: {err}"), }) @@ -363,9 +369,14 @@ impl InterpretedWalReader { metric.dec(); } + let ttid = self.wal_stream.ttid; match self.run_impl(start_pos).await { Err(err @ InterpretedWalReaderError::Decode(_)) => { - critical!("failed to decode WAL record: {err:?}"); + critical_timeline!( + ttid.tenant_id, + ttid.timeline_id, + "failed to decode WAL record: {err:?}" + ); } Err(err) => error!("failed to read WAL record: {err}"), Ok(()) => info!("interpreted wal reader exiting"), @@ -550,6 +561,20 @@ impl InterpretedWalReader { // Update internal and external state, then reset the WAL stream // if required. let senders = self.shard_senders.entry(shard_id).or_default(); + + // Clean up any shard senders that have dropped out before adding the new + // one. This avoids a build up of dead senders. 
+ senders.retain(|sender| { + let closed = sender.tx.is_closed(); + + if closed { + let sender_id = ShardSenderId::new(shard_id, sender.sender_id); + tracing::info!("Removed shard sender {}", sender_id); + } + + !closed + }); + let new_sender_id = match senders.last() { Some(sender) => sender.sender_id.next(), None => SenderId::first() diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index cf38019f66..b52ed84e53 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -26,7 +26,9 @@ use utils::id::{NodeId, TenantId, TenantTimelineId}; use utils::lsn::Lsn; use utils::sync::gate::Gate; -use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics}; +use crate::metrics::{ + FullTimelineInfo, MISC_OPERATION_SECONDS, WAL_STORAGE_LIMIT_ERRORS, WalStorageMetrics, +}; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; @@ -195,7 +197,7 @@ impl StateSK { Ok(TimelineMembershipSwitchResponse { previous_conf: result.previous_conf, current_conf: result.current_conf, - term: self.state().acceptor_state.term, + last_log_term: self.state().acceptor_state.term, flush_lsn: self.flush_lsn(), }) } @@ -1047,6 +1049,39 @@ impl WalResidentTimeline { Ok(ss) } + // BEGIN HADRON + // Check if disk usage by WAL segment files for this timeline exceeds the configured limit. + fn hadron_check_disk_usage( + &self, + shared_state_locked: &mut WriteGuardSharedState<'_>, + ) -> Result<()> { + // The disk usage is calculated based on the number of segments between `last_removed_segno` + // and the current flush LSN segment number. `last_removed_segno` is advanced after + // unneeded WAL files are physically removed from disk (see `update_wal_removal_end()` + // in `timeline_manager.rs`). + let max_timeline_disk_usage_bytes = self.conf.max_timeline_disk_usage_bytes; + if max_timeline_disk_usage_bytes > 0 { + let last_removed_segno = self.last_removed_segno.load(Ordering::Relaxed); + let flush_lsn = shared_state_locked.sk.flush_lsn(); + let wal_seg_size = shared_state_locked.sk.state().server.wal_seg_size as u64; + let current_segno = flush_lsn.segment_number(wal_seg_size as usize); + + let segno_count = current_segno - last_removed_segno; + let disk_usage_bytes = segno_count * wal_seg_size; + + if disk_usage_bytes > max_timeline_disk_usage_bytes { + WAL_STORAGE_LIMIT_ERRORS.inc(); + bail!( + "WAL storage utilization exceeds configured limit of {} bytes: current disk usage: {} bytes", + max_timeline_disk_usage_bytes, + disk_usage_bytes + ); + } + } + Ok(()) + } + // END HADRON + /// Pass arrived message to the safekeeper. pub async fn process_msg( &self, @@ -1059,6 +1094,13 @@ impl WalResidentTimeline { let mut rmsg: Option; { let mut shared_state = self.write_shared_state().await; + // BEGIN HADRON + // Errors from the `hadron_check_disk_usage()` function fail the process_msg() function, which + // gets propagated upward and terminates the entire WalAcceptor. This will cause postgres to + // disconnect from the safekeeper and reestablish another connection. Postgres will keep retrying + // safekeeper connections every second until it can successfully propose WAL to the SK again. + self.hadron_check_disk_usage(&mut shared_state)?; + // END HADRON rmsg = shared_state.sk.safekeeper().process_msg(msg).await?; // if this is AppendResponse, fill in proper hot standby feedback. 
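For `hadron_check_disk_usage()` above, disk usage is approximated as whole WAL segments between `last_removed_segno` and the segment containing the flush LSN. A rough stand-alone model with plain integers; the real code uses `Lsn::segment_number()` and an `AtomicU64` for `last_removed_segno`, and the numbers below are illustrative only:

```rust
// Segment N covers LSNs [N * wal_seg_size, (N + 1) * wal_seg_size).
fn timeline_wal_disk_usage(flush_lsn: u64, last_removed_segno: u64, wal_seg_size: u64) -> u64 {
    let current_segno = flush_lsn / wal_seg_size;
    // Segments in (last_removed_segno, current_segno] are still resident on disk.
    (current_segno - last_removed_segno) * wal_seg_size
}

fn main() {
    let seg: u64 = 16 * 1024 * 1024; // default 16 MiB WAL segments
    // Flushed a bit past the start of segment 5, segments up to #2 already removed:
    // three 16 MiB segments are still on disk, i.e. 48 MiB.
    assert_eq!(timeline_wal_disk_usage(5 * seg + 42, 2, seg), 3 * seg);
}
```

When this estimate exceeds `max_timeline_disk_usage_bytes`, the real code bumps `WAL_STORAGE_LIMIT_ERRORS` and bails out of `process_msg()`, which intentionally drops the compute connection until WAL backup or removal catches up.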
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0beb272a60..7e10847a1b 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -26,7 +26,9 @@ use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; use utils::{backoff, pausable_failpoint}; -use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; +use crate::metrics::{ + BACKED_UP_SEGMENTS, BACKUP_ERRORS, BACKUP_REELECT_LEADER_COUNT, WAL_BACKUP_TASKS, +}; use crate::timeline::WalResidentTimeline; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; @@ -70,8 +72,9 @@ pub(crate) async fn update_task( need_backup: bool, state: &StateSnapshot, ) { - let (offloader, election_dbg_str) = - determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); + /* BEGIN_HADRON */ + let (offloader, election_dbg_str) = hadron_determine_offloader(mgr, state); + /* END_HADRON */ let elected_me = Some(mgr.conf.my_id) == offloader; let should_task_run = need_backup && elected_me; @@ -127,6 +130,70 @@ async fn shut_down_task(entry: &mut Option) { } } +/* BEGIN_HADRON */ +// On top of the neon determine_offloader, we also check if the current offloader is lagging behind too much. +// If it is, we re-elect a new offloader. This mitigates the below issue. It also helps distribute the load across SKs. +// +// We observe that the offloader fails to upload a segment due to race conditions on XLOG SWITCH and PG start streaming WALs. +// wal_backup task continously failing to upload a full segment while the segment remains partial on the disk. +// The consequence is that commit_lsn for all SKs move forward but backup_lsn stays the same. Then, all SKs run out of disk space. +// See go/sk-ood-xlog-switch for more details. +// +// To mitigate this issue, we will re-elect a new offloader if the current offloader is lagging behind too much. +// Each SK makes the decision locally but they are aware of each other's commit and backup lsns. +// +// determine_offloader will pick a SK. say SK-1. +// Each SK checks +// -- if commit_lsn - back_lsn > threshold, +// -- -- remove SK-1 from the candidate and call determine_offloader again. +// SK-1 will step down and all SKs will elect the same leader again. +// After the backup is caught up, the leader will become SK-1 again. +fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option, String) { + let mut offloader: Option; + let mut election_dbg_str: String; + let caughtup_peers_count: usize; + (offloader, election_dbg_str, caughtup_peers_count) = + determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); + + if offloader.is_none() + || caughtup_peers_count <= 1 + || mgr.conf.max_reelect_offloader_lag_bytes == 0 + { + return (offloader, election_dbg_str); + } + + let offloader_sk_id = offloader.unwrap(); + + let backup_lag = state.commit_lsn.checked_sub(state.backup_lsn); + if backup_lag.is_none() { + info!("Backup lag is None. Skipping re-election."); + return (offloader, election_dbg_str); + } + + let backup_lag = backup_lag.unwrap().0; + + if backup_lag < mgr.conf.max_reelect_offloader_lag_bytes { + return (offloader, election_dbg_str); + } + + info!( + "Electing a new leader: Backup lag is too high backup lsn lag {} threshold {}: {}", + backup_lag, mgr.conf.max_reelect_offloader_lag_bytes, election_dbg_str + ); + BACKUP_REELECT_LEADER_COUNT.inc(); + // Remove the current offloader if lag is too high. 
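A toy model of the re-election rule described in the comment block above, using made-up `SkId`/`Peer` types and plain `u64` LSNs. It simplifies the real guard `caughtup_peers_count <= 1` to `peers.len() <= 1` and folds `determine_offloader` into a highest-commit-LSN pick, so treat it as an illustration of the decision flow, not the actual election:

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct SkId(u64);

#[derive(Clone, Copy)]
struct Peer {
    id: SkId,
    commit_lsn: u64,
}

// Stand-in for determine_offloader(): pick the peer with the highest commit_lsn.
fn elect(peers: &[Peer]) -> Option<SkId> {
    peers.iter().max_by_key(|p| p.commit_lsn).map(|p| p.id)
}

fn choose_offloader(
    peers: &[Peer],
    commit_lsn: u64,
    backup_lsn: u64,
    max_reelect_lag: u64, // 0 disables re-election
) -> Option<SkId> {
    let current = elect(peers)?;
    let backup_lag = commit_lsn.saturating_sub(backup_lsn);
    if max_reelect_lag == 0 || peers.len() <= 1 || backup_lag < max_reelect_lag {
        return Some(current);
    }
    // The current offloader is too far behind on uploads: drop it and re-run the election.
    let rest: Vec<Peer> = peers.iter().copied().filter(|p| p.id != current).collect();
    elect(&rest).or(Some(current))
}

fn main() {
    let peers = [
        Peer { id: SkId(1), commit_lsn: 300 },
        Peer { id: SkId(2), commit_lsn: 290 },
    ];
    // Backups are 200 bytes behind commits and the threshold is 100: SK 1 steps down.
    assert_eq!(choose_offloader(&peers, 300, 100, 100), Some(SkId(2)));
    // Threshold 0 disables re-election entirely.
    assert_eq!(choose_offloader(&peers, 300, 100, 0), Some(SkId(1)));
}
```

Because every safekeeper sees the same peer LSNs via the broker, they all exclude the same lagging node and converge on the same replacement, and the original node becomes electable again once its backup catches up.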
+ let new_peers: Vec<_> = state + .peers + .iter() + .filter(|p| p.sk_id != offloader_sk_id) + .cloned() + .collect(); + (offloader, election_dbg_str, _) = + determine_offloader(&new_peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); + (offloader, election_dbg_str) +} +/* END_HADRON */ + /// The goal is to ensure that normally only one safekeepers offloads. However, /// it is fine (and inevitable, as s3 doesn't provide CAS) that for some short /// time we have several ones as they PUT the same files. Also, @@ -141,13 +208,13 @@ fn determine_offloader( wal_backup_lsn: Lsn, ttid: TenantTimelineId, conf: &SafeKeeperConf, -) -> (Option, String) { +) -> (Option, String, usize) { // TODO: remove this once we fill newly joined safekeepers since backup_lsn. let capable_peers = alive_peers .iter() .filter(|p| p.local_start_lsn <= wal_backup_lsn); match capable_peers.clone().map(|p| p.commit_lsn).max() { - None => (None, "no connected peers to elect from".to_string()), + None => (None, "no connected peers to elect from".to_string(), 0), Some(max_commit_lsn) => { let threshold = max_commit_lsn .checked_sub(conf.max_offloader_lag_bytes) @@ -175,6 +242,7 @@ fn determine_offloader( capable_peers_dbg, caughtup_peers.len() ), + caughtup_peers.len(), ) } } @@ -346,6 +414,8 @@ async fn backup_lsn_range( anyhow::bail!("parallel_jobs must be >= 1"); } + pausable_failpoint!("backup-lsn-range-pausable"); + let remote_timeline_path = &timeline.remote_path; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index aab82fedb5..cba156888c 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -1,15 +1,15 @@ use std::pin::Pin; use std::task::{Context, Poll}; -use bytes::Bytes; -use futures::stream::BoxStream; -use futures::{Stream, StreamExt}; -use safekeeper_api::Term; -use utils::lsn::Lsn; - use crate::send_wal::EndWatch; use crate::timeline::WalResidentTimeline; use crate::wal_storage::WalReader; +use bytes::Bytes; +use futures::stream::BoxStream; +use futures::{Stream, StreamExt}; +use safekeeper_api::Term; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; #[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { @@ -37,6 +37,8 @@ struct PositionedWalReader { pub(crate) struct StreamingWalReader { stream: BoxStream<'static, WalOrReset>, start_changed_tx: tokio::sync::watch::Sender, + // HADRON: Added TenantTimelineId for instrumentation purposes. 
+ pub(crate) ttid: TenantTimelineId, } pub(crate) enum WalOrReset { @@ -63,6 +65,7 @@ impl StreamingWalReader { buffer_size: usize, ) -> Self { let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start); + let ttid = tli.ttid; let state = WalReaderStreamState { tli, @@ -107,6 +110,7 @@ impl StreamingWalReader { Self { stream, start_changed_tx, + ttid, } } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 70e53d86ee..9864ac3229 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -31,7 +31,8 @@ use utils::id::TenantTimelineId; use utils::lsn::Lsn; use crate::metrics::{ - REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, + REMOVED_WAL_SEGMENTS, WAL_DISK_IO_ERRORS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, + time_io_closure, }; use crate::state::TimelinePersistentState; use crate::wal_backup::{WalBackup, read_object, remote_timeline_path}; @@ -293,9 +294,12 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. let tmp_path = self.timeline_dir.join("waltmp"); - let file = File::create(&tmp_path) - .await - .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; + let file: File = File::create(&tmp_path).await.with_context(|| { + /* BEGIN_HADRON */ + WAL_DISK_IO_ERRORS.inc(); + /* END_HADRON */ + format!("Failed to open tmp wal file {:?}", &tmp_path) + })?; fail::fail_point!("sk-zero-segment", |_| { info!("sk-zero-segment failpoint hit"); @@ -382,7 +386,11 @@ impl PhysicalStorage { let flushed = self .write_in_segment(segno, xlogoff, &buf[..bytes_write]) - .await?; + .await + /* BEGIN_HADRON */ + .inspect_err(|_| WAL_DISK_IO_ERRORS.inc())?; + /* END_HADRON */ + self.write_lsn += bytes_write as u64; if flushed { self.flush_lsn = self.write_lsn; @@ -491,7 +499,11 @@ impl Storage for PhysicalStorage { } if let Some(unflushed_file) = self.file.take() { - self.fdatasync_file(&unflushed_file).await?; + self.fdatasync_file(&unflushed_file) + .await + /* BEGIN_HADRON */ + .inspect_err(|_| WAL_DISK_IO_ERRORS.inc())?; + /* END_HADRON */ self.file = Some(unflushed_file); } else { // We have unflushed data (write_lsn != flush_lsn), but no file. 
This diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 1fdf8e4949..1f6990c682 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -159,6 +159,10 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { heartbeat_timeout: Duration::from_secs(0), remote_storage: None, max_offloader_lag_bytes: 0, + /* BEGIN_HADRON */ + max_reelect_offloader_lag_bytes: 0, + max_timeline_disk_usage_bytes: 0, + /* END_HADRON */ wal_backup_enabled: false, listen_pg_addr_tenant_only: None, advertise_pg_addr: None, diff --git a/storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/down.sql b/storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/down.sql new file mode 100644 index 0000000000..bc9b501189 --- /dev/null +++ b/storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/down.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause'; \ No newline at end of file diff --git a/storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/up.sql b/storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/up.sql new file mode 100644 index 0000000000..18c89bed7b --- /dev/null +++ b/storage_controller/migrations/2025-07-02-170751_safekeeper_default_no_pause/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'activating'; \ No newline at end of file diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index a630316f46..7a111f6329 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -6,6 +6,11 @@ use utils::id::NodeId; pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64; +#[derive(Copy, Clone)] +pub(crate) struct Delete { + pub(crate) node_id: NodeId, +} + #[derive(Copy, Clone)] pub(crate) struct Drain { pub(crate) node_id: NodeId, @@ -18,6 +23,7 @@ pub(crate) struct Fill { #[derive(Copy, Clone)] pub(crate) enum Operation { + Delete(Delete), Drain(Drain), Fill(Fill), } @@ -30,6 +36,8 @@ pub(crate) enum OperationError { FinalizeError(Cow<'static, str>), #[error("Operation cancelled")] Cancelled, + #[error("Impossible constraint error: {0}")] + ImpossibleConstraint(Cow<'static, str>), } pub(crate) struct OperationHandler { @@ -38,6 +46,12 @@ pub(crate) struct OperationHandler { pub(crate) cancel: CancellationToken, } +impl Display for Delete { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "delete {}", self.node_id) + } +} + impl Display for Drain { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "drain {}", self.node_id) @@ -53,6 +67,7 @@ impl Display for Fill { impl Display for Operation { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { + Operation::Delete(op) => write!(f, "{op}"), Operation::Drain(op) => write!(f, "{op}"), Operation::Fill(op) => write!(f, "{op}"), } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 66c44b5674..e5a3a969d4 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -919,7 +919,7 @@ async fn handle_node_drop(req: Request) -> Result, ApiError json_response(StatusCode::OK, state.service.node_drop(node_id).await?) 
} -async fn handle_node_delete(req: Request) -> Result, ApiError> { +async fn handle_node_delete_old(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; let req = match maybe_forward(req).await { @@ -931,7 +931,10 @@ async fn handle_node_delete(req: Request) -> Result, ApiErr let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; - json_response(StatusCode::OK, state.service.node_delete(node_id).await?) + json_response( + StatusCode::OK, + state.service.node_delete_old(node_id).await?, + ) } async fn handle_tombstone_list(req: Request) -> Result, ApiError> { @@ -1051,6 +1054,42 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro json_response(StatusCode::OK, leader) } +async fn handle_node_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response( + StatusCode::OK, + state.service.start_node_delete(node_id).await?, + ) +} + +async fn handle_cancel_node_delete(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Infra)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response( + StatusCode::ACCEPTED, + state.service.cancel_node_delete(node_id).await?, + ) +} + async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Infra)?; @@ -2221,8 +2260,14 @@ pub fn make_router( .post("/control/v1/node", |r| { named_request_span(r, handle_node_register, RequestName("control_v1_node")) }) + // This endpoint is deprecated and will be removed in a future version. + // Use PUT /control/v1/node/:node_id/delete instead. 
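The deprecation comment above points at the new delete routes registered just below (`PUT`/`DELETE /control/v1/node/:node_id/delete`). A hypothetical client-side sketch, assuming a locally reachable storage controller and bearer-token auth; the address, port and tokens are placeholders, and per the handlers a start requires admin scope while a cancel requires infra scope and replies 202 Accepted:

```rust
use anyhow::Result;

#[tokio::main]
async fn main() -> Result<()> {
    let base = "http://127.0.0.1:1234"; // storage controller address (assumption)
    let node_id = 42;
    let client = reqwest::Client::new();

    // Kick off a graceful background delete of the node.
    let start = client
        .put(format!("{base}/control/v1/node/{node_id}/delete"))
        .bearer_auth("ADMIN_TOKEN")
        .send()
        .await?;
    println!("start delete -> {}", start.status());

    // Cancel it again while it is still running.
    let cancel = client
        .delete(format!("{base}/control/v1/node/{node_id}/delete"))
        .bearer_auth("INFRA_TOKEN")
        .send()
        .await?;
    println!("cancel delete -> {}", cancel.status());
    Ok(())
}
```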
.delete("/control/v1/node/:node_id", |r| { - named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete")) + named_request_span( + r, + handle_node_delete_old, + RequestName("control_v1_node_delete"), + ) }) .get("/control/v1/node", |r| { named_request_span(r, handle_node_list, RequestName("control_v1_node")) @@ -2247,6 +2292,20 @@ pub fn make_router( .get("/control/v1/leader", |r| { named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader")) }) + .put("/control/v1/node/:node_id/delete", |r| { + named_request_span( + r, + handle_node_delete, + RequestName("control_v1_start_node_delete"), + ) + }) + .delete("/control/v1/node/:node_id/delete", |r| { + named_request_span( + r, + handle_cancel_node_delete, + RequestName("control_v1_cancel_node_delete"), + ) + }) .put("/control/v1/node/:node_id/drain", |r| { named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) }) @@ -2312,7 +2371,7 @@ pub fn make_router( named_request_span( r, handle_safekeeper_scheduling_policy, - RequestName("v1_safekeeper_status"), + RequestName("v1_safekeeper_scheduling_policy"), ) }) // Tenant Shard operations diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index a9ec511431..36e3c5dc6c 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -6,13 +6,13 @@ extern crate hyper0 as hyper; mod auth; mod background_node_operations; mod compute_hook; -mod drain_utils; mod heartbeater; pub mod http; mod id_lock_map; mod leadership; pub mod metrics; mod node; +mod operation_utils; mod pageserver_client; mod peer_client; pub mod persistence; diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index f7f77cdd23..8738386968 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,6 +76,9 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, + /// How many shards are keep-failing and will be ignored when considering to run optimizations + pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index cba007d75f..6642c72f3c 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -201,6 +201,7 @@ impl Node { match self.scheduling { NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization), + NodeSchedulingPolicy::Deleting => MaySchedule::No, NodeSchedulingPolicy::Draining => MaySchedule::No, NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization), NodeSchedulingPolicy::Pause => MaySchedule::No, diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/operation_utils.rs similarity index 64% rename from storage_controller/src/drain_utils.rs rename to storage_controller/src/operation_utils.rs index 0dae7b8147..af86010ab7 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/operation_utils.rs @@ -10,63 +10,19 @@ use crate::node::Node; use crate::scheduler::Scheduler; use crate::tenant_shard::TenantShard; -pub(crate) struct TenantShardIterator { - tenants_accessor: F, - inspected_all_shards: bool, - last_inspected_shard: Option, -} - -/// A simple iterator which can be used in tandem with [`crate::service::Service`] 
-/// to iterate over all known tenant shard ids without holding the lock on the -/// service state at all times. -impl TenantShardIterator -where - F: Fn(Option) -> Option, -{ - pub(crate) fn new(tenants_accessor: F) -> Self { - Self { - tenants_accessor, - inspected_all_shards: false, - last_inspected_shard: None, - } - } - - /// Returns the next tenant shard id if one exists - pub(crate) fn next(&mut self) -> Option { - if self.inspected_all_shards { - return None; - } - - match (self.tenants_accessor)(self.last_inspected_shard) { - Some(tid) => { - self.last_inspected_shard = Some(tid); - Some(tid) - } - None => { - self.inspected_all_shards = true; - None - } - } - } - - /// Returns true when the end of the iterator is reached and false otherwise - pub(crate) fn finished(&self) -> bool { - self.inspected_all_shards - } -} - /// Check that the state of the node being drained is as expected: -/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`] +/// node is present in memory and scheduling policy is set to expected_policy pub(crate) fn validate_node_state( node_id: &NodeId, nodes: Arc>, + expected_policy: NodeSchedulingPolicy, ) -> Result<(), OperationError> { let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged( format!("node {node_id} was removed").into(), ))?; let current_policy = node.get_scheduling(); - if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + if current_policy != expected_policy { // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think // about it return Err(OperationError::NodeStateChanged( @@ -182,55 +138,3 @@ impl TenantShardDrain { } } } - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use utils::id::TenantId; - use utils::shard::{ShardCount, ShardNumber, TenantShardId}; - - use super::TenantShardIterator; - - #[test] - fn test_tenant_shard_iterator() { - let tenant_id = TenantId::generate(); - let shard_count = ShardCount(8); - - let mut tenant_shards = Vec::default(); - for i in 0..shard_count.0 { - tenant_shards.push(( - TenantShardId { - tenant_id, - shard_number: ShardNumber(i), - shard_count, - }, - (), - )) - } - - let tenant_shards = Arc::new(tenant_shards); - - let mut tid_iter = TenantShardIterator::new({ - let tenants = tenant_shards.clone(); - move |last_inspected_shard: Option| { - let entry = match last_inspected_shard { - Some(skip_past) => { - let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); - cursor.nth(1) - } - None => tenants.first(), - }; - - entry.map(|(tid, _)| tid).copied() - } - }); - - let mut iterated_over = Vec::default(); - while let Some(tid) = tid_iter.next() { - iterated_over.push((tid, ())); - } - - assert_eq!(iterated_over, *tenant_shards); - } -} diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 56f4d03111..ed9a268064 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -635,18 +635,23 @@ impl Persistence { let updated = self .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { Box::pin(async move { - // Check if the node is not marked as deleted - let deleted_node: i64 = nodes + let node: Option = nodes .filter(node_id.eq(input_node_id.0 as i64)) - .filter(lifecycle.eq(String::from(NodeLifecycle::Deleted))) - .count() - .get_result(conn) - .await?; - if deleted_node > 0 { - return Err(DatabaseError::Logical(format!( - "Node {input_node_id} is marked as deleted, re-attach is not allowed" - ))); - } + 
.first::(conn) + .await + .optional()?; + + // Check if the node is not marked as deleted + match node { + Some(node) if matches!(NodeLifecycle::from_str(&node.lifecycle), Ok(NodeLifecycle::Deleted)) => { + return Err(DatabaseError::Logical(format!( + "Node {input_node_id} is marked as deleted, re-attach is not allowed" + ))); + } + _ => { + // go through + } + }; let rows_updated = diesel::update(tenant_shards) .filter(generation_pageserver.eq(input_node_id.0 as i64)) @@ -664,21 +669,23 @@ impl Persistence { .load(conn) .await?; - // If the node went through a drain and restart phase before re-attaching, - // then reset it's node scheduling policy to active. - diesel::update(nodes) - .filter(node_id.eq(input_node_id.0 as i64)) - .filter( - scheduling_policy - .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Draining))) - .or(scheduling_policy - .eq(String::from(NodeSchedulingPolicy::Filling))), - ) - .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) - .execute(conn) - .await?; + if let Some(node) = node { + let old_scheduling_policy = + NodeSchedulingPolicy::from_str(&node.scheduling_policy).unwrap(); + let new_scheduling_policy = match old_scheduling_policy { + NodeSchedulingPolicy::Active => NodeSchedulingPolicy::Active, + NodeSchedulingPolicy::PauseForRestart => NodeSchedulingPolicy::Active, + NodeSchedulingPolicy::Draining => NodeSchedulingPolicy::Active, + NodeSchedulingPolicy::Filling => NodeSchedulingPolicy::Active, + NodeSchedulingPolicy::Pause => NodeSchedulingPolicy::Pause, + NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause, + }; + diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set(scheduling_policy.eq(String::from(new_scheduling_policy))) + .execute(conn) + .await?; + } Ok(updated) }) @@ -1388,6 +1395,48 @@ impl Persistence { .await } + /// Activate the given safekeeper, ensuring that there is no TOCTOU. + /// Returns `Some` if the safekeeper has indeed been activating (or already active). Other states return `None`. + pub(crate) async fn activate_safekeeper(&self, id_: i64) -> Result, DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| { + Box::pin(async move { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_active = String::from(SkSchedulingPolicy::Active); + let scheduling_policy_activating = String::from(SkSchedulingPolicy::Activating); + + let rows_affected = diesel::update( + safekeepers.filter(id.eq(id_)).filter( + scheduling_policy + .eq(scheduling_policy_activating) + .or(scheduling_policy.eq(&scheduling_policy_active)), + ), + ) + .set(scheduling_policy.eq(&scheduling_policy_active)) + .execute(conn) + .await?; + + if rows_affected == 0 { + return Ok(Some(())); + } + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } + + Ok(Some(())) + }) + }) + .await + } + /// Persist timeline. Returns if the timeline was newly inserted. If it wasn't, we haven't done any writes. 
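Restating the re-attach policy reset from the `re_attach` hunk above as a pure function, with a local enum standing in for `NodeSchedulingPolicy` (which lives in `pageserver_api`): nodes interrupted mid drain/fill/restart come back `Active`, while paused or deleting nodes come back `Pause` until an operator acts.

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Policy {
    Active,
    PauseForRestart,
    Draining,
    Filling,
    Pause,
    Deleting,
}

fn policy_after_reattach(before: Policy) -> Policy {
    match before {
        Policy::Active | Policy::PauseForRestart | Policy::Draining | Policy::Filling => {
            Policy::Active
        }
        Policy::Pause | Policy::Deleting => Policy::Pause,
    }
}

fn main() {
    assert_eq!(policy_after_reattach(Policy::Draining), Policy::Active);
    assert_eq!(policy_after_reattach(Policy::Deleting), Policy::Pause);
}
```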
pub(crate) async fn insert_timeline(&self, entry: TimelinePersistence) -> DatabaseResult { use crate::schema::timelines; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e4c494db8f..403ae15b59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,8 +1,8 @@ pub mod chaos_injector; -mod context_iterator; pub mod feature_flag; pub(crate) mod safekeeper_reconciler; mod safekeeper_service; +mod tenant_shard_iterator; use std::borrow::Cow; use std::cmp::Ordering; @@ -16,7 +16,6 @@ use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant, SystemTime}; use anyhow::Context; -use context_iterator::TenantShardContextIterator; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, }; @@ -31,8 +30,8 @@ use pageserver_api::controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, - TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ @@ -55,6 +54,7 @@ use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::{Certificate, StatusCode}; use safekeeper_api::models::SafekeeperUtilization; use safekeeper_reconciler::SafekeeperReconcilers; +use tenant_shard_iterator::{TenantShardExclusiveIterator, create_shared_shard_iterator}; use tokio::sync::TryAcquireError; use tokio::sync::mpsc::error::TrySendError; use tokio_util::sync::CancellationToken; @@ -68,10 +68,9 @@ use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; use crate::background_node_operations::{ - Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, + Delete, Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler, }; use crate::compute_hook::{self, ComputeHook, NotifyError}; -use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator}; use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState}; use crate::id_lock_map::{ IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock, @@ -79,6 +78,7 @@ use crate::id_lock_map::{ use crate::leadership::Leadership; use crate::metrics; use crate::node::{AvailabilityTransition, Node}; +use crate::operation_utils::{self, TenantShardDrain}; use crate::pageserver_client::PageserverClient; use crate::peer_client::GlobalObservedState; use crate::persistence::split_state::SplitState; @@ -105,7 +105,7 @@ use crate::timeline_import::{ TimelineImportFinalizeError, TimelineImportState, UpcallClient, }; -const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); +const WAITER_OPERATION_POLL_TIMEOUT: Duration = Duration::from_millis(500); // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -210,6 +210,10 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const 
SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; +// Number of consecutive reconciliation errors, occured for one shard, +// after which the shard is ignored when considering to run optimizations. +const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5; + // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly // than they're being pushed onto the queue. @@ -577,7 +581,9 @@ impl From for ApiError { impl From for ApiError { fn from(value: OperationError) -> Self { match value { - OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => { + OperationError::NodeStateChanged(err) + | OperationError::FinalizeError(err) + | OperationError::ImpossibleConstraint(err) => { ApiError::InternalServerError(anyhow::anyhow!(err)) } OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()), @@ -702,6 +708,36 @@ struct ShardMutationLocations { #[derive(Default, Clone)] struct TenantMutationLocations(BTreeMap); +struct ReconcileAllResult { + spawned_reconciles: usize, + keep_failing_reconciles: usize, + has_delayed_reconciles: bool, +} + +impl ReconcileAllResult { + fn new( + spawned_reconciles: usize, + keep_failing_reconciles: usize, + has_delayed_reconciles: bool, + ) -> Self { + assert!( + spawned_reconciles >= keep_failing_reconciles, + "It is impossible to have more keep-failing reconciles than spawned reconciles" + ); + Self { + spawned_reconciles, + keep_failing_reconciles, + has_delayed_reconciles, + } + } + + /// We can run optimizations only if we don't have any delayed reconciles and + /// all spawned reconciles are also keep-failing reconciles. + fn can_run_optimizations(&self) -> bool { + !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles + } +} + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -899,7 +935,7 @@ impl Service { // which require it: under normal circumstances this should only include tenants that were in some // transient state before we restarted, or any tenants whose compute hooks failed above. tracing::info!("Checking for shards in need of reconciliation..."); - let reconcile_tasks = self.reconcile_all(); + let reconcile_all_result = self.reconcile_all(); // We will not wait for these reconciliation tasks to run here: we're now done with startup and // normal operations may proceed. @@ -947,8 +983,9 @@ impl Service { } } + let spawned_reconciles = reconcile_all_result.spawned_reconciles; tracing::info!( - "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)" + "Startup complete, spawned {spawned_reconciles} reconciliation tasks ({shard_count} shards total)" ); } @@ -1199,8 +1236,8 @@ impl Service { while !self.reconcilers_cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { - let reconciles_spawned = self.reconcile_all(); - if reconciles_spawned == 0 { + let reconcile_all_result = self.reconcile_all(); + if reconcile_all_result.can_run_optimizations() { // Run optimizer only when we didn't find any other work to do self.optimize_all().await; } @@ -1214,7 +1251,7 @@ impl Service { } /// Heartbeat all storage nodes once in a while. 
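A self-contained restatement of the `ReconcileAllResult` introduced above, to make the optimizer-gating rule easy to check by eye: optimizations may run only when no reconcile was delayed and every reconcile that was spawned is one that is already keep-failing (so it no longer counts as real pending work).

```rust
struct ReconcileAllResult {
    spawned_reconciles: usize,
    keep_failing_reconciles: usize,
    has_delayed_reconciles: bool,
}

impl ReconcileAllResult {
    fn can_run_optimizations(&self) -> bool {
        !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
    }
}

fn main() {
    let idle = ReconcileAllResult {
        spawned_reconciles: 0,
        keep_failing_reconciles: 0,
        has_delayed_reconciles: false,
    };
    let busy = ReconcileAllResult {
        spawned_reconciles: 3,
        keep_failing_reconciles: 0,
        has_delayed_reconciles: false,
    };
    let only_stuck = ReconcileAllResult {
        spawned_reconciles: 2,
        keep_failing_reconciles: 2,
        has_delayed_reconciles: false,
    };
    assert!(idle.can_run_optimizations());
    assert!(!busy.can_run_optimizations());
    assert!(only_stuck.can_run_optimizations());
}
```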
#[instrument(skip_all)] - async fn spawn_heartbeat_driver(&self) { + async fn spawn_heartbeat_driver(self: &Arc) { self.startup_complete.clone().wait().await; let mut interval = tokio::time::interval(self.config.heartbeat_interval); @@ -1341,18 +1378,51 @@ impl Service { } } if let Ok(deltas) = res_sk { - let mut locked = self.inner.write().unwrap(); - let mut safekeepers = (*locked.safekeepers).clone(); - for (id, state) in deltas.0 { - let Some(sk) = safekeepers.get_mut(&id) else { - tracing::info!( - "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" - ); - continue; - }; - sk.set_availability(state); + let mut to_activate = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + + for (id, state) in deltas.0 { + let Some(sk) = safekeepers.get_mut(&id) else { + tracing::info!( + "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}" + ); + continue; + }; + if sk.scheduling_policy() == SkSchedulingPolicy::Activating + && let SafekeeperState::Available { .. } = state + { + to_activate.push(id); + } + sk.set_availability(state); + } + locked.safekeepers = Arc::new(safekeepers); + } + for sk_id in to_activate { + // TODO this can race with set_scheduling_policy (can create disjoint DB <-> in-memory state) + tracing::info!("Activating safekeeper {sk_id}"); + match self.persistence.activate_safekeeper(sk_id.0 as i64).await { + Ok(Some(())) => {} + Ok(None) => { + tracing::info!( + "safekeeper {sk_id} has been removed from db or has different scheduling policy than active or activating" + ); + } + Err(e) => { + tracing::warn!("couldn't apply activation of {sk_id} to db: {e}"); + continue; + } + } + if let Err(e) = self + .set_safekeeper_scheduling_policy_in_mem(sk_id, SkSchedulingPolicy::Active) + .await + { + tracing::info!("couldn't activate safekeeper {sk_id} in memory: {e}"); + continue; + } + tracing::info!("Activation of safekeeper {sk_id} done"); } - locked.safekeepers = Arc::new(safekeepers); } } } @@ -1408,6 +1478,7 @@ impl Service { match result.result { Ok(()) => { + tenant.consecutive_errors_count = 0; tenant.apply_observed_deltas(deltas); tenant.waiter.advance(result.sequence); } @@ -1426,6 +1497,8 @@ impl Service { } } + tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1); + // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. tenant.set_last_error(result.sequence, e); @@ -2343,6 +2416,7 @@ impl Service { NodeSchedulingPolicy::PauseForRestart | NodeSchedulingPolicy::Draining | NodeSchedulingPolicy::Filling + | NodeSchedulingPolicy::Deleting ); let mut new_nodes = (**nodes).clone(); @@ -6984,7 +7058,7 @@ impl Service { /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense /// that we don't leave any bad state behind in the storage controller, but unclean /// in the sense that we are not carefully draining the node. 
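Toy version of the `Activating` to `Active` promotion added to the heartbeat loop above, with local enums in place of `SkSchedulingPolicy`/`SafekeeperState`. The real code first persists the change through `activate_safekeeper()` (a conditional update that only flips rows still in the activating or active state, guarding against the TOCTOU noted in the comment) and only then updates the in-memory map; this sketch shows just the decision:

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum SkPolicy { Activating, Active, Pause }

#[derive(Clone, Copy)]
enum Heartbeat { Available, Offline }

fn next_policy(current: SkPolicy, hb: Heartbeat) -> SkPolicy {
    match (current, hb) {
        // The first successful heartbeat after registration promotes the safekeeper.
        (SkPolicy::Activating, Heartbeat::Available) => SkPolicy::Active,
        // Everything else is left to explicit scheduling-policy changes.
        (p, _) => p,
    }
}

fn main() {
    assert_eq!(next_policy(SkPolicy::Activating, Heartbeat::Available), SkPolicy::Active);
    assert_eq!(next_policy(SkPolicy::Pause, Heartbeat::Available), SkPolicy::Pause);
    assert_eq!(next_policy(SkPolicy::Activating, Heartbeat::Offline), SkPolicy::Activating);
}
```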
- pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> { + pub(crate) async fn node_delete_old(&self, node_id: NodeId) -> Result<(), ApiError> { let _node_lock = trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await; @@ -7018,7 +7092,7 @@ impl Service { } for (_tenant_id, mut schedule_context, shards) in - TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal) { for shard in shards { if shard.deref_node(node_id) { @@ -7087,6 +7161,174 @@ impl Service { Ok(()) } + pub(crate) async fn delete_node( + self: &Arc, + node_id: NodeId, + policy_on_start: NodeSchedulingPolicy, + cancel: CancellationToken, + ) -> Result<(), OperationError> { + let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build(); + + let mut waiters: Vec = Vec::new(); + let mut tid_iter = create_shared_shard_iterator(self.clone()); + + while !tid_iter.finished() { + if cancel.is_cancelled() { + match self + .node_configure(node_id, None, Some(policy_on_start)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}", + node_id, String::from(policy_on_start), err + ) + .into(), + )); + } + } + } + + operation_utils::validate_node_state( + &node_id, + self.inner.read().unwrap().nodes.clone(), + NodeSchedulingPolicy::Deleting, + )?; + + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let tid = match tid_iter.next() { + Some(tid) => tid, + None => { + break; + } + }; + + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + // Calculate a schedule context here to avoid borrow checker issues. + let mut schedule_context = ScheduleContext::default(); + for (_, shard) in tenants.range(TenantShardId::tenant_range(tid.tenant_id)) { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + + let tenant_shard = match tenants.get_mut(&tid) { + Some(tenant_shard) => tenant_shard, + None => { + // Tenant shard was deleted by another operation. Skip it. + continue; + } + }; + + match tenant_shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => { + // A migration during delete is classed as 'essential' because it is required to + // uphold our availability goals for the tenant: this shard is elegible for migration. 
+ } + ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { + // If we have been asked to avoid rescheduling this shard, then do not migrate it during a deletion + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Skip migration during deletion because shard scheduling policy {:?} disallows it", + tenant_shard.get_scheduling_policy(), + ); + continue; + } + } + + if tenant_shard.deref_node(node_id) { + if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { + tracing::error!( + "Refusing to delete node, shard {} can't be rescheduled: {e}", + tenant_shard.tenant_shard_id + ); + return Err(OperationError::ImpossibleConstraint(e.to_string().into())); + } else { + tracing::info!( + "Rescheduled shard {} away from node during deletion", + tenant_shard.tenant_shard_id + ) + } + + let waiter = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ); + if let Some(some) = waiter { + waiters.push(some); + } + } + } + + waiters = self + .await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT) + .await; + + failpoint_support::sleep_millis_async!("sleepy-delete-loop", &cancel); + } + + while !waiters.is_empty() { + if cancel.is_cancelled() { + match self + .node_configure(node_id, None, Some(policy_on_start)) + .await + { + Ok(()) => return Err(OperationError::Cancelled), + Err(err) => { + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}", + node_id, String::from(policy_on_start), err + ) + .into(), + )); + } + } + } + + tracing::info!("Awaiting {} pending delete reconciliations", waiters.len()); + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + self.persistence + .set_tombstone(node_id) + .await + .map_err(|e| OperationError::FinalizeError(e.to_string().into()))?; + + { + let mut locked = self.inner.write().unwrap(); + let (nodes, _, scheduler) = locked.parts_mut(); + + scheduler.node_remove(node_id); + + let mut nodes_mut = (**nodes).clone(); + if let Some(mut removed_node) = nodes_mut.remove(&node_id) { + // Ensure that any reconciler holding an Arc<> to this node will + // drop out when trying to RPC to it (setting Offline state sets the + // cancellation token on the Node object). 
+ removed_node.set_availability(NodeAvailability::Offline); + } + *nodes = Arc::new(nodes_mut); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(nodes.values().filter(|n| n.has_https_port()).count() as i64); + } + + Ok(()) + } + pub(crate) async fn node_list(&self) -> Result, ApiError> { let nodes = { self.inner @@ -7475,7 +7717,7 @@ impl Service { let mut tenants_affected: usize = 0; for (_tenant_id, mut schedule_context, shards) in - TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal) { for tenant_shard in shards { let tenant_shard_id = tenant_shard.tenant_shard_id; @@ -7646,6 +7888,142 @@ impl Service { self.node_configure(node_id, availability, scheduling).await } + pub(crate) async fn start_node_delete( + self: &Arc, + node_id: NodeId, + ) -> Result<(), ApiError> { + let (ongoing_op, node_policy, schedulable_nodes_count) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + let schedulable_nodes_count = nodes + .iter() + .filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_))) + .count(); + + ( + locked + .ongoing_operation + .as_ref() + .map(|ongoing| ongoing.operation), + node.get_scheduling(), + schedulable_nodes_count, + ) + }; + + if let Some(ongoing) = ongoing_op { + return Err(ApiError::PreconditionFailed( + format!("Background operation already ongoing for node: {ongoing}").into(), + )); + } + + if schedulable_nodes_count == 0 { + return Err(ApiError::PreconditionFailed( + "No other schedulable nodes to move shards".into(), + )); + } + + match node_policy { + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Deleting)) + .await?; + + let cancel = self.cancel.child_token(); + let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?; + let policy_on_start = node_policy; + + self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { + operation: Operation::Delete(Delete { node_id }), + cancel: cancel.clone(), + }); + + let span = tracing::info_span!(parent: None, "delete_node", %node_id); + + tokio::task::spawn( + { + let service = self.clone(); + let cancel = cancel.clone(); + async move { + let _gate_guard = gate_guard; + + scopeguard::defer! 
{ + let prev = service.inner.write().unwrap().ongoing_operation.take(); + + if let Some(Operation::Delete(removed_delete)) = prev.map(|h| h.operation) { + assert_eq!(removed_delete.node_id, node_id, "We always take the same operation"); + } else { + panic!("We always remove the same operation") + } + } + + tracing::info!("Delete background operation starting"); + let res = service + .delete_node(node_id, policy_on_start, cancel) + .await; + match res { + Ok(()) => { + tracing::info!( + "Delete background operation completed successfully" + ); + } + Err(OperationError::Cancelled) => { + tracing::info!("Delete background operation was cancelled"); + } + Err(err) => { + tracing::error!( + "Delete background operation encountered: {err}" + ) + } + } + } + } + .instrument(span), + ); + } + NodeSchedulingPolicy::Deleting => { + return Err(ApiError::Conflict(format!( + "Node {node_id} has delete in progress" + ))); + } + policy => { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} cannot be deleted due to {policy:?} policy").into(), + )); + } + } + + Ok(()) + } + + pub(crate) async fn cancel_node_delete( + self: &Arc, + node_id: NodeId, + ) -> Result<(), ApiError> { + { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + } + + if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() { + if let Operation::Delete(delete) = op_handler.operation { + if delete.node_id == node_id { + tracing::info!("Cancelling background delete operation for node {node_id}"); + op_handler.cancel.cancel(); + return Ok(()); + } + } + } + + Err(ApiError::PreconditionFailed( + format!("Node {node_id} has no delete in progress").into(), + )) + } + pub(crate) async fn start_node_drain( self: &Arc, node_id: NodeId, @@ -8026,7 +8404,7 @@ impl Service { /// Returns how many reconciliation tasks were started, or `1` if no reconciles were /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where /// available. A return value of 0 indicates that everything is fully reconciled already. - fn reconcile_all(&self) -> usize { + fn reconcile_all(&self) -> ReconcileAllResult { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); @@ -8034,13 +8412,16 @@ impl Service { // This function is an efficient place to update lazy statistics, since we are walking // all tenants. let mut pending_reconciles = 0; + let mut keep_failing_reconciles = 0; let mut az_violations = 0; // If we find any tenants to drop from memory, stash them to offload after // we're done traversing the map of tenants. let mut drop_detached_tenants = Vec::new(); - let mut reconciles_spawned = 0; + let mut spawned_reconciles = 0; + let mut has_delayed_reconciles = false; + for shard in tenants.values_mut() { // Accumulate scheduling statistics if let (Some(attached), Some(preferred)) = @@ -8060,18 +8441,32 @@ impl Service { // If there is something delayed, then return a nonzero count so that // callers like reconcile_all_now do not incorrectly get the impression // that the system is in a quiescent state. 
- reconciles_spawned = std::cmp::max(1, reconciles_spawned); + has_delayed_reconciles = true; pending_reconciles += 1; continue; } // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another one + let consecutive_errors_count = shard.consecutive_errors_count; if self .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) .is_some() { - reconciles_spawned += 1; + spawned_reconciles += 1; + + // Count shards that are keep-failing. We still want to reconcile them + // to avoid a situation where a shard is stuck. + // But we don't want to consider them when deciding to run optimizations. + if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS { + tracing::warn!( + tenant_id=%shard.tenant_shard_id.tenant_id, + shard_id=%shard.tenant_shard_id.shard_slug(), + "Shard reconciliation is keep-failing: {} errors", + consecutive_errors_count + ); + keep_failing_reconciles += 1; + } } else if shard.delayed_reconcile { // Shard wanted to reconcile but for some reason couldn't. pending_reconciles += 1; @@ -8110,7 +8505,16 @@ impl Service { .storage_controller_pending_reconciles .set(pending_reconciles as i64); - reconciles_spawned + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_keep_failing_reconciles + .set(keep_failing_reconciles as i64); + + ReconcileAllResult::new( + spawned_reconciles, + keep_failing_reconciles, + has_delayed_reconciles, + ) } /// `optimize` in this context means identifying shards which have valid scheduled locations, but @@ -8196,7 +8600,7 @@ impl Service { // to ignore the utilisation component of the score. for (_tenant_id, schedule_context, shards) in - TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) + TenantShardExclusiveIterator::new(tenants, ScheduleMode::Speculative) { for shard in shards { if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { @@ -8783,13 +9187,13 @@ impl Service { /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should /// put the system into a quiescent state where future background reconciliations won't do anything. 
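Sketch of the consecutive-error bookkeeping behind the keep-failing counter above: `consecutive_errors_count` is reset on a successful reconcile and saturating-incremented on failure (see the `process_result` hunk earlier in `service.rs`), and a shard is treated as keep-failing once it reaches `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` (5 in this patch).

```rust
const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;

#[derive(Default)]
struct ShardStats {
    consecutive_errors_count: usize,
}

impl ShardStats {
    fn record_result(&mut self, ok: bool) {
        if ok {
            self.consecutive_errors_count = 0;
        } else {
            self.consecutive_errors_count = self.consecutive_errors_count.saturating_add(1);
        }
    }

    fn is_keep_failing(&self) -> bool {
        self.consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS
    }
}

fn main() {
    let mut shard = ShardStats::default();
    for _ in 0..5 {
        shard.record_result(false);
    }
    assert!(shard.is_keep_failing());
    shard.record_result(true); // one success clears the streak
    assert!(!shard.is_keep_failing());
}
```

Keep-failing shards are still reconciled (so nothing gets permanently stuck), but they are excluded when deciding whether the system is quiet enough to run optimizations, and their count is exported via `storage_controller_keep_failing_reconciles`.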
pub(crate) async fn reconcile_all_now(&self) -> Result { - let reconciles_spawned = self.reconcile_all(); - let reconciles_spawned = if reconciles_spawned == 0 { + let reconcile_all_result = self.reconcile_all(); + let mut spawned_reconciles = reconcile_all_result.spawned_reconciles; + if reconcile_all_result.can_run_optimizations() { // Only optimize when we are otherwise idle - self.optimize_all().await - } else { - reconciles_spawned - }; + let optimization_reconciles = self.optimize_all().await; + spawned_reconciles += optimization_reconciles; + } let waiters = { let mut waiters = Vec::new(); @@ -8826,11 +9230,11 @@ impl Service { tracing::info!( "{} reconciles in reconcile_all, {} waiters", - reconciles_spawned, + spawned_reconciles, waiter_count ); - Ok(std::cmp::max(waiter_count, reconciles_spawned)) + Ok(std::cmp::max(waiter_count, spawned_reconciles)) } async fn stop_reconciliations(&self, reason: StopReconciliationsReason) { @@ -8923,25 +9327,7 @@ impl Service { let mut waiters = Vec::new(); - let mut tid_iter = TenantShardIterator::new({ - let service = self.clone(); - move |last_inspected_shard: Option| { - let locked = &service.inner.read().unwrap(); - let tenants = &locked.tenants; - let entry = match last_inspected_shard { - Some(skip_past) => { - // Skip to the last seen tenant shard id - let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); - - // Skip past the last seen - cursor.nth(1) - } - None => tenants.first_key_value(), - }; - - entry.map(|(tid, _)| tid).copied() - } - }); + let mut tid_iter = create_shared_shard_iterator(self.clone()); while !tid_iter.finished() { if cancel.is_cancelled() { @@ -8961,7 +9347,11 @@ impl Service { } } - drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?; + operation_utils::validate_node_state( + &node_id, + self.inner.read().unwrap().nodes.clone(), + NodeSchedulingPolicy::Draining, + )?; while waiters.len() < MAX_RECONCILES_PER_OPERATION { let tid = match tid_iter.next() { @@ -9041,7 +9431,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT) .await; failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); @@ -9335,7 +9725,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT) .await; } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index cf48b007b2..d7179372b2 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -236,40 +236,30 @@ impl Service { F: std::future::Future> + Send + 'static, T: Sync + Send + 'static, { + let target_sk_count = safekeepers.len(); + + if target_sk_count == 0 { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "timeline configured without any safekeepers" + ))); + } + + if target_sk_count < self.config.timeline_safekeeper_count { + tracing::warn!( + "running a quorum operation with {} safekeepers, which is less than configured {} safekeepers per timeline", + target_sk_count, + self.config.timeline_safekeeper_count + ); + } + let results = self .tenant_timeline_safekeeper_op(safekeepers, op, timeout) .await?; // Now check if quorum was reached in results. 
- let target_sk_count = safekeepers.len(); - let quorum_size = match target_sk_count { - 0 => { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "timeline configured without any safekeepers", - ))); - } - 1 | 2 => { - #[cfg(feature = "testing")] - { - // In test settings, it is allowed to have one or two safekeepers - target_sk_count - } - #[cfg(not(feature = "testing"))] - { - // The region is misconfigured: we need at least three safekeepers to be configured - // in order to schedule work to them - tracing::warn!( - "couldn't find at least 3 safekeepers for timeline, found: {:?}", - target_sk_count - ); - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find at least 3 safekeepers to put timeline to" - ))); - } - } - _ => target_sk_count / 2 + 1, - }; + let quorum_size = target_sk_count / 2 + 1; + let success_count = results.iter().filter(|res| res.is_ok()).count(); if success_count < quorum_size { // Failure @@ -815,7 +805,7 @@ impl Service { Safekeeper::from_persistence( crate::persistence::SafekeeperPersistence::from_upsert( record, - SkSchedulingPolicy::Pause, + SkSchedulingPolicy::Activating, ), CancellationToken::new(), use_https, @@ -856,27 +846,36 @@ impl Service { .await?; let node_id = NodeId(id as u64); // After the change has been persisted successfully, update the in-memory state - { - let mut locked = self.inner.write().unwrap(); - let mut safekeepers = (*locked.safekeepers).clone(); - let sk = safekeepers - .get_mut(&node_id) - .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.set_scheduling_policy(scheduling_policy); + self.set_safekeeper_scheduling_policy_in_mem(node_id, scheduling_policy) + .await + } - match scheduling_policy { - SkSchedulingPolicy::Active => { - locked - .safekeeper_reconcilers - .start_reconciler(node_id, self); - } - SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { - locked.safekeeper_reconcilers.stop_reconciler(node_id); - } + pub(crate) async fn set_safekeeper_scheduling_policy_in_mem( + self: &Arc, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + sk.set_scheduling_policy(scheduling_policy); + + match scheduling_policy { + SkSchedulingPolicy::Active => { + locked + .safekeeper_reconcilers + .start_reconciler(node_id, self); + } + SkSchedulingPolicy::Decomissioned + | SkSchedulingPolicy::Pause + | SkSchedulingPolicy::Activating => { + locked.safekeeper_reconcilers.stop_reconciler(node_id); } - - locked.safekeepers = Arc::new(safekeepers); } + + locked.safekeepers = Arc::new(safekeepers); Ok(()) } @@ -915,13 +914,13 @@ impl Service { // so it isn't counted toward the quorum. if let Some(min_position) = min_position { if let Ok(ok_res) = &res { - if (ok_res.term, ok_res.flush_lsn) < min_position { + if (ok_res.last_log_term, ok_res.flush_lsn) < min_position { // Use Error::Timeout to make this error retriable. 
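Reviewer note: the remaining hunks in this file switch position comparisons from `term` to `last_log_term`. Positions are ordered as `(last_log_term, flush_lsn)` tuples, so the log term is the primary key and the flush LSN only breaks ties; both the minimum-position check and the `sync_position` maximum rely on that ordering. A standalone illustration, with `Lsn` simplified to `u64`:

```rust
/// Illustrative only: safekeeper positions ordered as (last_log_term, flush_lsn).
type Lsn = u64;
type Position = (u64, Lsn);

fn main() {
    let min_position: Position = (3, 0x2000);

    // Tuple comparison is lexicographic: a higher term wins regardless of LSN...
    assert!((4, 0x1000) > min_position);
    // ...and within the same term, the larger flush LSN wins.
    assert!((3, 0x1500) < min_position);

    // Picking the sync position is just a max over the reported positions.
    let reported: Vec<Position> = vec![(3, 0x2000), (4, 0x1000), (2, 0x9000)];
    let sync_position = reported.into_iter().max().unwrap();
    assert_eq!(sync_position, (4, 0x1000));
}
```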
res = Err(mgmt_api::Error::Timeout( format!( "safekeeper {} returned position {:?} which is less than minimum required position {:?}", client.node_id_label(), - (ok_res.term, ok_res.flush_lsn), + (ok_res.last_log_term, ok_res.flush_lsn), min_position ) )); @@ -1217,7 +1216,7 @@ impl Service { let mut sync_position = (INITIAL_TERM, Lsn::INVALID); for res in results.into_iter().flatten() { - let sk_position = (res.term, res.flush_lsn); + let sk_position = (res.last_log_term, res.flush_lsn); if sync_position < sk_position { sync_position = sk_position; } diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/tenant_shard_iterator.rs similarity index 52% rename from storage_controller/src/service/context_iterator.rs rename to storage_controller/src/service/tenant_shard_iterator.rs index c4784e5e36..576b94b3a4 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/tenant_shard_iterator.rs @@ -1,4 +1,5 @@ use std::collections::BTreeMap; +use std::sync::Arc; use utils::id::TenantId; use utils::shard::TenantShardId; @@ -6,16 +7,21 @@ use utils::shard::TenantShardId; use crate::scheduler::{ScheduleContext, ScheduleMode}; use crate::tenant_shard::TenantShard; +use super::Service; + +/// Exclusive iterator over all tenant shards. +/// It is used to iterate over a consistent view of the tenant shard state at a specific point in time. +/// /// When making scheduling decisions, it is useful to have the ScheduleContext for a whole /// tenant while considering the individual shards within it. This iterator is a helper /// that gathers all the shards in a tenant and then yields them together with a ScheduleContext /// for the tenant. -pub(super) struct TenantShardContextIterator<'a> { +pub(super) struct TenantShardExclusiveIterator<'a> { schedule_mode: ScheduleMode, inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>, } -impl<'a> TenantShardContextIterator<'a> { +impl<'a> TenantShardExclusiveIterator<'a> { pub(super) fn new( tenants: &'a mut BTreeMap, schedule_mode: ScheduleMode, @@ -27,7 +33,7 @@ impl<'a> TenantShardContextIterator<'a> { } } -impl<'a> Iterator for TenantShardContextIterator<'a> { +impl<'a> Iterator for TenantShardExclusiveIterator<'a> { type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>); fn next(&mut self) -> Option { @@ -52,13 +58,93 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { } } +/// Shared iterator over all tenant shards. +/// It is used to iterate over all tenants without blocking other code that works with them. + +/// A simple iterator which can be used in tandem with [`crate::service::Service`] +/// to iterate over all known tenant shard ids without holding the lock on the +/// service state at all times.
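Reviewer note: the new `TenantShardSharedIterator` (defined next) does not hold the service lock across iterations. Instead it calls an accessor closure that re-reads the map, finds the last key it handed out, and steps one past it; `create_shared_shard_iterator` further down in this hunk builds exactly that closure over `service.inner`. A minimal sketch of the cursor-resume pattern over a plain `BTreeMap` (illustrative only, not storage-controller code):

```rust
use std::collections::BTreeMap;

/// Each call re-reads the map and returns the key that follows the last one
/// it handed out, so no lock needs to be held between calls.
fn next_key_after(map: &BTreeMap<u32, &str>, last: Option<u32>) -> Option<u32> {
    let entry = match last {
        Some(skip_past) => {
            // Skip to the last seen key, then step one past it.
            let mut cursor = map.iter().skip_while(|(k, _)| **k != skip_past);
            cursor.nth(1)
        }
        None => map.first_key_value(),
    };
    entry.map(|(k, _)| *k)
}

fn main() {
    let map: BTreeMap<u32, &str> = [(1, "a"), (5, "b"), (9, "c")].into_iter().collect();
    let mut last = None;
    let mut seen = Vec::new();
    while let Some(k) = next_key_after(&map, last) {
        seen.push(k);
        last = Some(k);
    }
    assert_eq!(seen, vec![1, 5, 9]);
}
```

The trade-off versus the exclusive iterator is that entries added or removed between calls may be skipped or seen late, which is acceptable for background drain/fill work.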
+pub(crate) struct TenantShardSharedIterator { + tenants_accessor: F, + inspected_all_shards: bool, + last_inspected_shard: Option, +} + +impl TenantShardSharedIterator +where + F: Fn(Option) -> Option, +{ + pub(crate) fn new(tenants_accessor: F) -> Self { + Self { + tenants_accessor, + inspected_all_shards: false, + last_inspected_shard: None, + } + } + + pub(crate) fn finished(&self) -> bool { + self.inspected_all_shards + } +} + +impl Iterator for TenantShardSharedIterator +where + F: Fn(Option) -> Option, +{ + // TODO(ephemeralsad): consider adding schedule context to the iterator + type Item = TenantShardId; + + /// Returns the next tenant shard id if one exists + fn next(&mut self) -> Option { + if self.inspected_all_shards { + return None; + } + + match (self.tenants_accessor)(self.last_inspected_shard) { + Some(tid) => { + self.last_inspected_shard = Some(tid); + Some(tid) + } + None => { + self.inspected_all_shards = true; + None + } + } + } +} + +pub(crate) fn create_shared_shard_iterator( + service: Arc, +) -> TenantShardSharedIterator) -> Option> { + let tenants_accessor = move |last_inspected_shard: Option| { + let locked = &service.inner.read().unwrap(); + let tenants = &locked.tenants; + let entry = match last_inspected_shard { + Some(skip_past) => { + // Skip to the last seen tenant shard id + let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past); + + // Skip past the last seen + cursor.nth(1) + } + None => tenants.first_key_value(), + }; + + entry.map(|(tid, _)| tid).copied() + }; + + TenantShardSharedIterator::new(tenants_accessor) +} + #[cfg(test)] mod tests { use std::collections::BTreeMap; use std::str::FromStr; + use std::sync::Arc; use pageserver_api::controller_api::PlacementPolicy; - use utils::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + use utils::shard::{ShardCount, ShardNumber, TenantShardId}; use super::*; use crate::scheduler::test_utils::make_test_nodes; @@ -66,7 +152,7 @@ mod tests { use crate::tenant_shard::tests::make_test_tenant_with_id; #[test] - fn test_context_iterator() { + fn test_exclusive_shard_iterator() { // Hand-crafted tenant IDs to ensure they appear in the expected order when put into // a btreemap & iterated let mut t_1_shards = make_test_tenant_with_id( @@ -106,7 +192,7 @@ mod tests { shard.schedule(&mut scheduler, &mut context).unwrap(); } - let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative); + let mut iter = TenantShardExclusiveIterator::new(&mut tenants, ScheduleMode::Speculative); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t1_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); @@ -132,4 +218,46 @@ mod tests { shard.intent.clear(&mut scheduler); } } + + #[test] + fn test_shared_shard_iterator() { + let tenant_id = TenantId::generate(); + let shard_count = ShardCount(8); + + let mut tenant_shards = Vec::default(); + for i in 0..shard_count.0 { + tenant_shards.push(( + TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count, + }, + (), + )) + } + + let tenant_shards = Arc::new(tenant_shards); + + let tid_iter = TenantShardSharedIterator::new({ + let tenants = tenant_shards.clone(); + move |last_inspected_shard: Option| { + let entry = match last_inspected_shard { + Some(skip_past) => { + let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past); + cursor.nth(1) + } + None => tenants.first(), + }; + + entry.map(|(tid, _)| tid).copied() + } + }); + + let mut iterated_over = 
Vec::default(); + for tid in tid_iter { + iterated_over.push((tid, ())); + } + + assert_eq!(iterated_over, *tenant_shards); + } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 359921ecbf..0bfca5385e 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -131,6 +131,15 @@ pub(crate) struct TenantShard { #[serde(serialize_with = "read_last_error")] pub(crate) last_error: std::sync::Arc>>>, + /// Number of consecutive reconciliation errors that have occurred for this shard. + /// + /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard + /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to + /// allowing optimizations to run even with some failing shards. + /// + /// The counter is reset to 0 after a successful reconciliation. + pub(crate) consecutive_errors_count: usize, + /// If we have a pending compute notification that for some reason we weren't able to send, /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope @@ -594,6 +603,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), + consecutive_errors_count: 0, pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), preferred_node: None, @@ -1859,6 +1869,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), + consecutive_errors_count: 0, pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index f5be544439..294c52321b 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -57,6 +57,8 @@ class EndpointHttpClient(requests.Session): self.auth = BearerAuth(jwt) self.mount("http://", HTTPAdapter()) + self.prewarm_url = f"http://localhost:{external_port}/lfc/prewarm" + self.offload_url = f"http://localhost:{external_port}/lfc/offload" def dbs_and_roles(self): res = self.get(f"http://localhost:{self.external_port}/dbs_and_roles", auth=self.auth) @@ -64,33 +66,39 @@ class EndpointHttpClient(requests.Session): return res.json() def prewarm_lfc_status(self) -> dict[str, str]: - res = self.get(f"http://localhost:{self.external_port}/lfc/prewarm") + res = self.get(self.prewarm_url) res.raise_for_status() json: dict[str, str] = res.json() return json def prewarm_lfc(self, from_endpoint_id: str | None = None): - url: str = f"http://localhost:{self.external_port}/lfc/prewarm" params = {"from_endpoint": from_endpoint_id} if from_endpoint_id else dict() - self.post(url, params=params).raise_for_status() + self.post(self.prewarm_url, params=params).raise_for_status() + self.prewarm_lfc_wait() + def prewarm_lfc_wait(self): def prewarmed(): json = self.prewarm_lfc_status() status, err = json["status"], json.get("error") - assert status == "completed", f"{status}, error {err}" + assert status == "completed", f"{status}, {err=}" wait_until(prewarmed, timeout=60) - def offload_lfc(self): - url = f"http://localhost:{self.external_port}/lfc/offload" - self.post(url).raise_for_status() + def 
offload_lfc_status(self) -> dict[str, str]: + res = self.get(self.offload_url) + res.raise_for_status() + json: dict[str, str] = res.json() + return json + def offload_lfc(self): + self.post(self.offload_url).raise_for_status() + self.offload_lfc_wait() + + def offload_lfc_wait(self): def offloaded(): - res = self.get(url) - res.raise_for_status() - json = res.json() + json = self.offload_lfc_status() status, err = json["status"], json.get("error") - assert status == "completed", f"{status}, error {err}" + assert status == "completed", f"{status}, {err=}" wait_until(offloaded) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 3bebf83014..65a4ddd160 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( ) PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( + # BEGIN_HADRON + "pageserver_active_storage_operations_count", + # END_HADRON "pageserver_current_logical_size", "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 1b09e5bdd0..1abd3396e4 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -568,6 +568,8 @@ class NeonLocalCli(AbstractNeonCli): timeout: str | None = None, env: dict[str, str] | None = None, dev: bool = False, + autoprewarm: bool = False, + offload_lfc_interval_seconds: int | None = None, ) -> subprocess.CompletedProcess[str]: args = [ "endpoint", @@ -593,6 +595,10 @@ class NeonLocalCli(AbstractNeonCli): args.extend(["--create-test-user"]) if timeout is not None: args.extend(["--start-timeout", str(timeout)]) + if autoprewarm: + args.extend(["--autoprewarm"]) + if offload_lfc_interval_seconds is not None: + args.extend(["--offload-lfc-interval-seconds", str(offload_lfc_interval_seconds)]) if dev: args.extend(["--dev"]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 60a88e1053..f54d5be635 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1875,6 +1875,7 @@ class PageserverSchedulingPolicy(StrEnum): FILLING = "Filling" PAUSE = "Pause" PAUSE_FOR_RESTART = "PauseForRestart" + DELETING = "Deleting" class StorageControllerLeadershipStatus(StrEnum): @@ -2083,14 +2084,30 @@ class NeonStorageController(MetricsGetter, LogUtils): headers=self.headers(TokenScope.ADMIN), ) - def node_delete(self, node_id): - log.info(f"node_delete({node_id})") + def node_delete_old(self, node_id): + log.info(f"node_delete_old({node_id})") self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}", headers=self.headers(TokenScope.ADMIN), ) + def node_delete(self, node_id): + log.info(f"node_delete({node_id})") + self.request( + "PUT", + f"{self.api}/control/v1/node/{node_id}/delete", + headers=self.headers(TokenScope.ADMIN), + ) + + def cancel_node_delete(self, node_id): + log.info(f"cancel_node_delete({node_id})") + self.request( + "DELETE", + f"{self.api}/control/v1/node/{node_id}/delete", + headers=self.headers(TokenScope.ADMIN), + ) + def tombstone_delete(self, node_id): log.info(f"tombstone_delete({node_id})") self.request( @@ -4353,6 +4370,8 @@ class Endpoint(PgProtocol, LogUtils): basebackup_request_tries: int | None = None, timeout: str | None = None, env: dict[str, str] | None = None, + autoprewarm: bool = False, + offload_lfc_interval_seconds: int | None = None, ) -> Self: """ Start the Postgres instance. 
@@ -4377,6 +4396,8 @@ class Endpoint(PgProtocol, LogUtils): basebackup_request_tries=basebackup_request_tries, timeout=timeout, env=env, + autoprewarm=autoprewarm, + offload_lfc_interval_seconds=offload_lfc_interval_seconds, ) self._running.release(1) self.log_config_value("shared_buffers") @@ -4592,6 +4613,8 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, + autoprewarm: bool = False, + offload_lfc_interval_seconds: int | None = None, ) -> Self: """ Create an endpoint, apply config, and start Postgres. @@ -4612,6 +4635,8 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, + autoprewarm=autoprewarm, + offload_lfc_interval_seconds=offload_lfc_interval_seconds, ) return self @@ -4696,6 +4721,8 @@ class EndpointFactory: remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, + autoprewarm: bool = False, + offload_lfc_interval_seconds: int | None = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -4717,6 +4744,8 @@ class EndpointFactory: remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, + autoprewarm=autoprewarm, + offload_lfc_interval_seconds=offload_lfc_interval_seconds, ) def create( diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 9b564f0a60..0e4dd571c0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,15 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + # Can happen in tests that purposely wipe pageserver "local disk" data. 
+ ".*Local data loss suspected.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", + # Transient errors when resolving tenant shards by page service + ".*Fail to resolve tenant shard in attempt.*", + # Expected warnings when pageserver has not refreshed GC info yet + ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*", + ".*No broker updates received for a while.*", *( [ r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 839e985419..942b620be6 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -112,12 +112,18 @@ class TimelineCreateRequest: class TimelineMembershipSwitchResponse: previous_conf: MembershipConfiguration current_conf: MembershipConfiguration + last_log_term: int + flush_lsn: Lsn @classmethod def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: previous_conf = MembershipConfiguration.from_json(d["previous_conf"]) current_conf = MembershipConfiguration.from_json(d["current_conf"]) - return TimelineMembershipSwitchResponse(previous_conf, current_conf) + last_log_term = d["last_log_term"] + flush_lsn = Lsn(d["flush_lsn"]) + return TimelineMembershipSwitchResponse( + previous_conf, current_conf, last_log_term, flush_lsn + ) class SafekeeperHttpClient(requests.Session, MetricsGetter): diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 41696bf887..8e7055ef78 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -55,9 +55,10 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("duration", [20 * 60]) @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) # we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability -# we use 64 clients because typically for a high number of connections we recommend the connection pooler -# which by default uses 64 connections -@pytest.mark.parametrize("n_clients", [1, 64]) +# we use 8 clients because we see a latency knee around 6-8 clients on im4gn.2xlarge instance type, +# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered +# a good utilization for pageserver. 
+@pytest.mark.parametrize("n_clients", [1, 8]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( @@ -70,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man n_clients: int, ): setup_and_run_pagebench_benchmark( - neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + neon_env_builder, + zenbenchmark, + pg_bin, + n_tenants, + pgbench_scale, + duration, + n_clients, ) @@ -85,7 +92,8 @@ def setup_and_run_pagebench_benchmark( ): def record(metric, **kwargs): zenbenchmark.record( - metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs + metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", + **kwargs, ) params: dict[str, tuple[Any, dict[str, Any]]] = {} @@ -103,9 +111,7 @@ def setup_and_run_pagebench_benchmark( # configure cache sizes like in prod page_cache_size = 16384 max_file_descriptors = 500000 - neon_env_builder.pageserver_config_override = ( - f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" - ) + neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}" tracing_config = PageserverTracingConfig( sampling_ratio=(0, 1000), @@ -121,7 +127,10 @@ def setup_and_run_pagebench_benchmark( page_cache_size * 8192, {"unit": "byte"}, ), - "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + "pageserver_config_override.max_file_descriptors": ( + max_file_descriptors, + {"unit": ""}, + ), "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}), } ) diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 920c538069..011c6896bd 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -416,6 +416,8 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder): # timeline creation (uploads). mask it out here to avoid flakyness. 
del success_result["remote_consistent_lsn_visible"] del repeat_result["remote_consistent_lsn_visible"] + del success_result["walreceiver_status"] + del repeat_result["walreceiver_status"] assert repeat_result == success_result finally: env.pageserver.stop(immediate=True) diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py index e1058cd644..ae36bbda79 100644 --- a/test_runner/regress/test_lfc_prewarm.py +++ b/test_runner/regress/test_lfc_prewarm.py @@ -1,61 +1,122 @@ import random import threading -import time -from enum import Enum +from enum import StrEnum +from typing import Any import pytest from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import USE_LFC +from fixtures.utils import USE_LFC, wait_until from prometheus_client.parser import text_string_to_metric_families as prom_parse_impl +from psycopg2.extensions import cursor as Cursor -class LfcQueryMethod(Enum): - COMPUTE_CTL = False - POSTGRES = True +class PrewarmMethod(StrEnum): + POSTGRES = "postgres" + COMPUTE_CTL = "compute-ctl" + AUTOPREWARM = "autoprewarm" -PREWARM_LABEL = "compute_ctl_lfc_prewarm_requests_total" -OFFLOAD_LABEL = "compute_ctl_lfc_offload_requests_total" -QUERY_OPTIONS = LfcQueryMethod.POSTGRES, LfcQueryMethod.COMPUTE_CTL +PREWARM_LABEL = "compute_ctl_lfc_prewarms_total" +PREWARM_ERR_LABEL = "compute_ctl_lfc_prewarm_errors_total" +OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total" +OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total" +METHOD_VALUES = [e for e in PrewarmMethod] +METHOD_IDS = [e.value for e in PrewarmMethod] -def check_pinned_entries(cur): - # some LFC buffer can be temporary locked by autovacuum or background writer - for _ in range(10): +def check_pinned_entries(cur: Cursor): + """ + Wait till none of LFC buffers are pinned + """ + + def none_pinned(): cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'") - n_pinned = cur.fetchall()[0][0] - if n_pinned == 0: - break - time.sleep(1) - assert n_pinned == 0 + assert cur.fetchall()[0][0] == 0 + + wait_until(none_pinned) def prom_parse(client: EndpointHttpClient) -> dict[str, float]: + labels = PREWARM_LABEL, OFFLOAD_LABEL, PREWARM_ERR_LABEL, OFFLOAD_ERR_LABEL return { - sample.name: sample.value + sample.name: int(sample.value) for family in prom_parse_impl(client.metrics()) for sample in family.samples - if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL) + if sample.name in labels } +def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any: + if method == PrewarmMethod.AUTOPREWARM: + client.offload_lfc_wait() + elif method == PrewarmMethod.COMPUTE_CTL: + status = client.prewarm_lfc_status() + assert status["status"] == "not_prewarmed" + assert "error" not in status + client.offload_lfc() + assert client.prewarm_lfc_status()["status"] == "not_prewarmed" + parsed = prom_parse(client) + desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0} + assert parsed == desired, f"{parsed=} != {desired=}" + elif method == PrewarmMethod.POSTGRES: + cur.execute("select get_local_cache_state()") + return cur.fetchall()[0][0] + else: + raise AssertionError(f"{method} not in PrewarmMethod") + + +def prewarm_endpoint( + method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor, lfc_state: str | None +): + if method == PrewarmMethod.AUTOPREWARM: + client.prewarm_lfc_wait() + elif method == 
PrewarmMethod.COMPUTE_CTL: + client.prewarm_lfc() + elif method == PrewarmMethod.POSTGRES: + cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + + +def check_prewarmed( + method: PrewarmMethod, client: EndpointHttpClient, desired_status: dict[str, str | int] +): + if method == PrewarmMethod.AUTOPREWARM: + assert client.prewarm_lfc_status() == desired_status + assert prom_parse(client)[PREWARM_LABEL] == 1 + elif method == PrewarmMethod.COMPUTE_CTL: + assert client.prewarm_lfc_status() == desired_status + desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0} + assert prom_parse(client) == desired + + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) -def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): +@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS) +def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod): + """ + Test we can offload endpoint's LFC cache to endpoint storage. + Test we can prewarm endpoint with LFC cache loaded from endpoint storage. + """ env = neon_simple_env n_records = 1000000 - endpoint = env.endpoints.create_start( - branch_name="main", - config_lines=[ - "autovacuum = off", - "shared_buffers=1MB", - "neon.max_file_cache_size=1GB", - "neon.file_cache_size_limit=1GB", - "neon.file_cache_prewarm_limit=1000", - ], - ) + cfg = [ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000", + ] + offload_secs = 2 + + if method == PrewarmMethod.AUTOPREWARM: + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=cfg, + autoprewarm=True, + offload_lfc_interval_seconds=offload_secs, + ) + else: + endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() @@ -69,75 +130,64 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, query: LfcQueryMethod): lfc_cur.execute(f"insert into t (pk) values (generate_series(1,{n_records}))") log.info(f"Inserted {n_records} rows") - http_client = endpoint.http_client() - if query is LfcQueryMethod.COMPUTE_CTL: - status = http_client.prewarm_lfc_status() - assert status["status"] == "not_prewarmed" - assert "error" not in status - http_client.offload_lfc() - assert http_client.prewarm_lfc_status()["status"] == "not_prewarmed" - assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0} - else: - pg_cur.execute("select get_local_cache_state()") - lfc_state = pg_cur.fetchall()[0][0] + client = endpoint.http_client() + lfc_state = offload_lfc(method, client, pg_cur) endpoint.stop() - endpoint.start() + if method == PrewarmMethod.AUTOPREWARM: + endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs) + else: + endpoint.start() pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() lfc_conn = endpoint.connect(dbname="lfc") lfc_cur = lfc_conn.cursor() - - if query is LfcQueryMethod.COMPUTE_CTL: - http_client.prewarm_lfc() - else: - pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) + prewarm_endpoint(method, client, pg_cur, lfc_state) pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'") lfc_used_pages = pg_cur.fetchall()[0][0] log.info(f"Used LFC size: {lfc_used_pages}") pg_cur.execute("select * from get_prewarm_info()") - prewarm_info = pg_cur.fetchall()[0] - log.info(f"Prewarm info: 
{prewarm_info}") - total, prewarmed, skipped, _ = prewarm_info + total, prewarmed, skipped, _ = pg_cur.fetchall()[0] + log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}") progress = (prewarmed + skipped) * 100 // total log.info(f"Prewarm progress: {progress}%") - assert lfc_used_pages > 10000 - assert ( - prewarm_info[0] > 0 - and prewarm_info[1] > 0 - and prewarm_info[0] == prewarm_info[1] + prewarm_info[2] - ) + assert total > 0 + assert prewarmed > 0 + assert total == prewarmed + skipped lfc_cur.execute("select sum(pk) from t") assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2 check_pinned_entries(pg_cur) - desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped} - if query is LfcQueryMethod.COMPUTE_CTL: - assert http_client.prewarm_lfc_status() == desired - assert prom_parse(http_client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1} + check_prewarmed(method, client, desired) + + +# autoprewarm isn't needed as we prewarm manually +WORKLOAD_VALUES = METHOD_VALUES[:-1] +WORKLOAD_IDS = METHOD_IDS[:-1] @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") -@pytest.mark.parametrize("query", QUERY_OPTIONS, ids=["postgres", "compute-ctl"]) -def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMethod): +@pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS) +def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod): + """ + Test continiously prewarming endpoint when there is a write-heavy workload going in parallel + """ env = neon_simple_env n_records = 10000 n_threads = 4 - endpoint = env.endpoints.create_start( - branch_name="main", - config_lines=[ - "shared_buffers=1MB", - "neon.max_file_cache_size=1GB", - "neon.file_cache_size_limit=1GB", - "neon.file_cache_prewarm_limit=1000000", - ], - ) + cfg = [ + "shared_buffers=1MB", + "neon.max_file_cache_size=1GB", + "neon.file_cache_size_limit=1GB", + "neon.file_cache_prewarm_limit=1000000", + ] + endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg) pg_conn = endpoint.connect() pg_cur = pg_conn.cursor() @@ -154,12 +204,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet log.info(f"Inserted {n_records} rows") http_client = endpoint.http_client() - if query is LfcQueryMethod.COMPUTE_CTL: - http_client.offload_lfc() - else: - pg_cur.execute("select get_local_cache_state()") - lfc_state = pg_cur.fetchall()[0][0] - + lfc_state = offload_lfc(method, http_client, pg_cur) running = True n_prewarms = 0 @@ -170,8 +215,8 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet while running: src = random.randint(1, n_records) dst = random.randint(1, n_records) - lfc_cur.execute("update accounts set balance=balance-100 where id=%s", (src,)) - lfc_cur.execute("update accounts set balance=balance+100 where id=%s", (dst,)) + lfc_cur.execute(f"update accounts set balance=balance-100 where id={src}") + lfc_cur.execute(f"update accounts set balance=balance+100 where id={dst}") n_transfers += 1 log.info(f"Number of transfers: {n_transfers}") @@ -183,13 +228,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet pg_cur.execute("select pg_reload_conf()") pg_cur.execute("alter system set neon.file_cache_size_limit='1GB'") pg_cur.execute("select pg_reload_conf()") - - if query is LfcQueryMethod.COMPUTE_CTL: - # Same thing as prewarm_lfc(), testing other method - http_client.prewarm_lfc(endpoint.endpoint_id) - else: - 
pg_cur.execute("select prewarm_local_cache(%s)", (lfc_state,)) - + prewarm_endpoint(method, http_client, pg_cur, lfc_state) nonlocal n_prewarms n_prewarms += 1 log.info(f"Number of prewarms: {n_prewarms}") @@ -203,7 +242,10 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet prewarm_thread = threading.Thread(target=prewarm) prewarm_thread.start() - time.sleep(20) + def prewarmed(): + assert n_prewarms > 5 + + wait_until(prewarmed) running = False for t in workload_threads: @@ -215,5 +257,12 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, query: LfcQueryMet assert total_balance == 0 check_pinned_entries(pg_cur) - if query is LfcQueryMethod.COMPUTE_CTL: - assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms} + if method == PrewarmMethod.POSTGRES: + return + desired = { + OFFLOAD_LABEL: 1, + PREWARM_LABEL: n_prewarms, + OFFLOAD_ERR_LABEL: 0, + PREWARM_ERR_LABEL: 0, + } + assert prom_parse(http_client) == desired diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 474258c9eb..52c33687ae 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -180,7 +180,7 @@ def test_metric_collection( httpserver.check() # Check that at least one bucket output object is present, and that all - # can be decompressed and decoded. + # can be decompressed and decoded as NDJSON. bucket_dumps = {} assert isinstance(env.pageserver_remote_storage, LocalFsStorage) for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): @@ -188,7 +188,13 @@ def test_metric_collection( file_path = os.path.join(dirpath, file) log.info(file_path) if file.endswith(".gz"): - bucket_dumps[file_path] = json.load(gzip.open(file_path)) + events = [] + with gzip.open(file_path, "rt") as f: + for line in f: + line = line.strip() + if line: + events.append(json.loads(line)) + bucket_dumps[file_path] = {"events": events} assert len(bucket_dumps) >= 1 assert all("events" in data for data in bucket_dumps.values()) diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index 057371175c..b82d7b9bb0 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -27,6 +27,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): [ ".*Timeline .* was cancelled and cannot be used anymore.*", ".*Timeline .* has been deleted.*", + ".*Timeline .* was not found in global map.*", ".*wal receiver task finished with an error.*", ] ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 93c621f564..8ff767eca4 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,8 +1,11 @@ from __future__ import annotations import os +import random +import threading import time from collections import defaultdict +from threading import Event from typing import TYPE_CHECKING, Any import pytest @@ -1505,6 +1508,171 @@ def test_sharding_split_failures( env.storage_controller.consistency_check() +@pytest.mark.skip(reason="The backpressure change has not been merged yet.") +def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder): + """ + Test backpressure can ignore new shards during tenant split so that if we abort the split, + PG can continue without being blocked. 
+ """ + DBNAME = "regression" + + init_shard_count = 4 + neon_env_builder.num_pageservers = init_shard_count + stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "max_replication_write_lag = 1MB", + "databricks.max_wal_mb_per_second = 1", + "neon.max_cluster_size = 10GB", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. + endpoint.start() + + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + + endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);") + write_done = Event() + + def write_data(write_done): + while not write_done.is_set(): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + log.info("write_data thread exiting") + + writer_thread = threading.Thread(target=write_data, args=(write_done,)) + writer_thread.start() + + env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + # split the tenant + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + write_done.set() + writer_thread.join() + + # writing more data to page servers after split is aborted + for _i in range(5000): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + + # wait until write lag becomes 0 + def check_write_lag_is_zero(): + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag + FROM neon.backpressure_lsns(); + """, + dbname="databricks_system", + log_query=False, + ) + log.info(f"received_lsn_lag = {res[0][0]}") + assert res[0][0] == 0 + + wait_until(check_write_lag_is_zero) + endpoint.stop_and_destroy() + + +# BEGIN_HADRON +def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder): + """ + Tests that page service is able to resolve the correct shard during tenant split without causing query errors + """ + DBNAME = "regression" + WORKER_THREADS = 16 + ROW_COUNT = 10000 + + init_shard_count = 4 + neon_env_builder.num_pageservers = 1 + stripe_size = 16 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. 
+ endpoint.start() + + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + + # generate 10MB of data + endpoint.safe_psql( + f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;" + ) + read_done = Event() + + def read_data(read_done): + i = 0 + while not read_done.is_set() or i < 10: + endpoint.safe_psql( + f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}", + log_query=False, + ) + i += 1 + log.info(f"read_data thread exiting. Executed {i} queries.") + + reader_threads = [] + for _i in range(WORKER_THREADS): + reader_thread = threading.Thread(target=read_data, args=(read_done,)) + reader_thread.start() + reader_threads.append(reader_thread) + + env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + # split the tenant + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + # wait until abort is done + def check_tenant_status(): + active_count = 0 + for i in range(init_shard_count): + status = env.pageserver.http_client().tenant_status( + TenantShardId(env.initial_tenant, i, init_shard_count) + ) + if status["state"]["slug"] == "Active": + active_count += 1 + assert active_count == 4 + + wait_until(check_tenant_status) + + read_done.set() + for thread in reader_threads: + thread.join() + + endpoint.stop() + + +# END_HADRON + + def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): """ Check a scenario when one of the shards is much slower than others. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 290ebe456b..10845ef02e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -989,6 +989,105 @@ def test_storage_controller_compute_hook_retry( ) +@run_only_on_default_postgres("postgres behavior is not relevant") +def test_storage_controller_compute_hook_keep_failing( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address: ListenAddress, +): + neon_env_builder.num_pageservers = 4 + neon_env_builder.storage_controller_config = {"use_local_compute_notifications": False} + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" + + # Set up CP handler for compute notifications + status_by_tenant: dict[TenantId, int] = {} + + def handler(request: Request): + notify_request = request.json + assert notify_request is not None + status = status_by_tenant[TenantId(notify_request["tenant_id"])] + log.info(f"Notify request[{status}]: {notify_request}") + return Response(status=status) + + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + + # Run neon environment + env = neon_env_builder.init_configs() + env.start() + + # Create two tenants: + # - The first tenant is banned by CP and contains only one shard + # - The second tenant is allowed by CP and contains four shards + banned_tenant = TenantId.generate() + status_by_tenant[banned_tenant] = 200 # we will ban this tenant later + env.create_tenant(banned_tenant, placement_policy='{"Attached": 1}') + + shard_count = 4 + allowed_tenant = TenantId.generate() + status_by_tenant[allowed_tenant] = 200 + env.create_tenant(allowed_tenant, shard_count=shard_count, placement_policy='{"Attached": 1}') + + # Find the pageserver of the banned tenant + banned_tenant_ps = env.get_tenant_pageserver(banned_tenant) + 
assert banned_tenant_ps is not None + alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id] + + # Stop pageserver and ban tenant to trigger failed reconciliation + log.info(f"Banning tenant {banned_tenant} and stopping pageserver {banned_tenant_ps.id}") + status_by_tenant[banned_tenant] = 423 + banned_tenant_ps.stop() + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*") + env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*") + env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"}) + + # Migrate all allowed tenant shards to the first alive pageserver + # to trigger storage controller optimizations due to affinity rules + for shard_number in range(shard_count): + log.info(f"Migrating shard {shard_number} of {allowed_tenant} to {alive_pageservers[0].id}") + env.storage_controller.tenant_shard_migrate( + TenantShardId(allowed_tenant, shard_number, shard_count), + alive_pageservers[0].id, + config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True), + ) + + # Make some reconcile_all calls to trigger optimizations + # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS + RECONCILE_COUNT = 12 + for i in range(RECONCILE_COUNT): + try: + n = env.storage_controller.reconcile_all() + log.info(f"Reconciliation attempt {i} finished with success: {n}") + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + log.info(f"Reconciliation attempt {i} finished with failure") + + banned_descr = env.storage_controller.tenant_describe(banned_tenant) + assert banned_descr["shards"][0]["is_pending_compute_notification"] is True + time.sleep(2) + + # Check that the allowed tenant shards are optimized due to affinity rules + locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"] + not_optimized_shard_count = 0 + for loc in locations: + tsi = TenantShardId.parse(loc[0]) + if tsi.tenant_id != allowed_tenant: + continue + if loc[1]["mode"] == "AttachedSingle": + not_optimized_shard_count += 1 + log.info(f"Shard {tsi} seen in mode {loc[1]['mode']}") + + assert not_optimized_shard_count < shard_count, "At least one shard should be optimized" + + # Unban the tenant and run reconciliations + status_by_tenant[banned_tenant] = 200 + env.storage_controller.reconcile_all() + banned_descr = env.storage_controller.tenant_describe(banned_tenant) + assert banned_descr["shards"][0]["is_pending_compute_notification"] is False + + @run_only_on_default_postgres("this test doesn't start an endpoint") def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, @@ -2522,7 +2621,7 @@ def test_storage_controller_node_deletion( wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") - env.storage_controller.node_delete(victim.id) + env.storage_controller.node_delete_old(victim.id) if not while_offline: @@ -2557,6 +2656,60 @@ def test_storage_controller_node_deletion( env.storage_controller.consistency_check() +def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 3 + neon_env_builder.num_azs = 3 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 12 + shard_count_per_tenant = 16 + tenant_ids = [] + + for _ in range(0, tenant_count): + tid = 
TenantId.generate() + tenant_ids.append(tid) + env.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # Sanity check: initial creations should not leave the system in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + + nodes = env.storage_controller.node_list() + assert len(nodes) == 3 + + env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)")) + + ps_id_to_delete = env.pageservers[0].id + + env.storage_controller.warm_up_all_secondaries() + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_delete(ps_id), + ps_id_to_delete, + max_attempts=3, + backoff=2, + ) + + env.storage_controller.poll_node_status( + ps_id_to_delete, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.DELETING, + max_attempts=6, + backoff=2, + ) + + env.storage_controller.cancel_node_delete(ps_id_to_delete) + + env.storage_controller.poll_node_status( + ps_id_to_delete, + PageserverAvailability.ACTIVE, + PageserverSchedulingPolicy.ACTIVE, + max_attempts=6, + backoff=2, + ) + + @pytest.mark.parametrize("shard_count", [None, 2]) def test_storage_controller_metadata_health( neon_env_builder: NeonEnvBuilder, @@ -3112,7 +3265,7 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder): assert_nodes_count(3) ps = env.pageservers[0] - env.storage_controller.node_delete(ps.id) + env.storage_controller.node_delete_old(ps.id) # After deletion, the node count must be reduced assert_nodes_count(2) @@ -3530,18 +3683,21 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): # some small tests for the scheduling policy querying and returning APIs newest_info = target.get_safekeeper(inserted["id"]) assert newest_info - assert newest_info["scheduling_policy"] == "Pause" - target.safekeeper_scheduling_policy(inserted["id"], "Active") - newest_info = target.get_safekeeper(inserted["id"]) - assert newest_info - assert newest_info["scheduling_policy"] == "Active" - # Ensure idempotency - target.safekeeper_scheduling_policy(inserted["id"], "Active") - newest_info = target.get_safekeeper(inserted["id"]) - assert newest_info - assert newest_info["scheduling_policy"] == "Active" - # change back to paused again + assert ( + newest_info["scheduling_policy"] == "Activating" + or newest_info["scheduling_policy"] == "Active" + ) target.safekeeper_scheduling_policy(inserted["id"], "Pause") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Pause" + # Ensure idempotency + target.safekeeper_scheduling_policy(inserted["id"], "Pause") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Pause" + # change back to active again + target.safekeeper_scheduling_policy(inserted["id"], "Active") def storcon_heartbeat(): assert env.storage_controller.log_contains( @@ -3554,6 +3710,57 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_safekeeper_activating_to_active(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + + fake_id = 5 + + target = env.storage_controller + + assert target.get_safekeeper(fake_id) is None + + start_sks = target.get_safekeepers() + + sk_0 = env.safekeepers[0] + + body = { + 
"active": True, + "id": fake_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-eu-central-1", + "host": "localhost", + "port": sk_0.port.pg, + "http_port": sk_0.port.http, + "https_port": None, + "version": 5957, + "availability_zone_id": "eu-central-1a", + } + + target.on_safekeeper_deploy(fake_id, body) + + inserted = target.get_safekeeper(fake_id) + assert inserted is not None + assert target.get_safekeepers() == start_sks + [inserted] + assert eq_safekeeper_records(body, inserted) + + def safekeeper_is_active(): + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Active" + + wait_until(safekeeper_is_active) + + target.safekeeper_scheduling_policy(inserted["id"], "Activating") + + wait_until(safekeeper_is_active) + + # Now decomission it + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 190dd914ee..8b291b7cbe 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -740,6 +740,10 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, "pitr_interval": "0s" if zero_gc else "3600s", "gc_period": "0s", "compaction_period": "0s", + # The test exercises leases API, so we need non-zero lease length. + # If this tests ever does GC, we need to accomodate for the initial lease deadline + # after tenant attach, which is also controlled by this variable. + "lsn_lease_length": "600s", } env = neon_env_builder.init_start(initial_tenant_conf=conf) @@ -824,9 +828,7 @@ def insert_with_action( log.info(f"initial size: {initial_size}") with ep.cursor() as cur: - cur.execute( - "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" - ) + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) if action == "lease": @@ -841,15 +843,9 @@ def insert_with_action( raise AssertionError("Invalid action type, only `lease` and `branch`are accepted") with ep.cursor() as cur: - cur.execute( - "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" - ) - cur.execute( - "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" - ) - cur.execute( - "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" - ) + cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") + cur.execute("CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") + cur.execute("CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b5cc431afe..c0f163db32 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -324,7 +324,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # it is to be in line with the deletion timestamp.. well, almost. 
when = original_ancestor[2][:26] when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) - now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + now = datetime.datetime.now(datetime.UTC) assert when_ts < now assert len(lineage.get("reparenting_history", [])) == 0 elif expected_ancestor == timeline_id: @@ -458,19 +458,20 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots env.pageserver.quiesce_tenants() - # checking the ancestor after is much faster than waiting for the endpoint not start + # checking the ancestor after is much faster than waiting for the endpoint to start expected_result = [ - ("main", env.initial_timeline, None, 24576, 1), - ("after", after, env.initial_timeline, 24576, 1), - ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1), - ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1), - ("branch_to_detach", branch_to_detach, None, 16384, 1), - ("earlier", earlier, env.initial_timeline, 0, 1), + # (branch_name, queried_timeline, expected_ancestor, rows, starts, read_only) + ("main", env.initial_timeline, None, 24576, 1, False), + ("after", after, env.initial_timeline, 24576, 1, False), + ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1, True), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1, False), + ("branch_to_detach", branch_to_detach, None, 16384, 1, False), + ("earlier", earlier, env.initial_timeline, 0, 1, False), ] assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result: + for branch_name, queried_timeline, expected_ancestor, _, _, _ in expected_result: details = client.timeline_detail(env.initial_tenant, queried_timeline) ancestor_timeline_id = details["ancestor_timeline_id"] if expected_ancestor is None: @@ -508,13 +509,17 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots assert len(lineage.get("original_ancestor", [])) == 0 assert len(lineage.get("reparenting_history", [])) == 0 - for branch_name, queried_timeline, _, rows, starts in expected_result: - details = client.timeline_detail(env.initial_tenant, queried_timeline) - log.info(f"reading data from branch {branch_name}") - # specifying the lsn makes the endpoint read-only and not connect to safekeepers + for branch_name, queried_timeline, _, rows, starts, read_only in expected_result: + last_record_lsn = None + if read_only: + # specifying the lsn makes the endpoint read-only and not connect to safekeepers + details = client.timeline_detail(env.initial_tenant, queried_timeline) + last_record_lsn = Lsn(details["last_record_lsn"]) + + log.info(f"reading data from branch {branch_name} at {last_record_lsn}") with env.endpoints.create( branch_name, - lsn=Lsn(details["last_record_lsn"]), + lsn=last_record_lsn, ) as ep: ep.start(safekeeper_generation=1) assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows @@ -1884,6 +1889,31 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) +def test_detach_ancestors_with_no_writes( + neon_env_builder: NeonEnvBuilder, +): + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + endpoint.safe_psql( + "SELECT 
+    )
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+    endpoint.stop()
+
+    for i in range(0, 5):
+        if i == 0:
+            ancestor_name = "main"
+        else:
+            ancestor_name = f"b{i}"
+
+        tlid = env.create_branch(f"b{i + 1}", ancestor_branch_name=ancestor_name)
+
+        client = env.pageserver.http_client()
+        client.detach_ancestor(tenant_id=env.initial_tenant, timeline_id=tlid)
+
+
 
 # TODO:
 # - branch near existing L1 boundary, image layers?
 # - investigate: why are layers started at uneven lsn? not just after branching, but in general.
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index ea120c1814..22e6d2e1c3 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2740,3 +2740,85 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
         raise Exception("Uneviction did not happen on source safekeeper yet")
 
     wait_until(unevicted)
+
+
+def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that the timeline disk usage circuit breaker works as expected. We test that:
+    1. The circuit breaker kicks in when the timeline's disk usage exceeds the configured limit,
+       and it causes writes to hang.
+    2. The hanging writes unblock when the issue resolves (by restarting the safekeeper in the
+       test to simulate a more realistic production troubleshooting scenario).
+    3. We can continue to write as normal after the issue resolves.
+    4. There is no data corruption throughout the test.
+    """
+    # Set up environment with a very small disk usage limit (1KB)
+    neon_env_builder.num_safekeepers = 1
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
+
+    # Set a very small disk usage limit (1KB)
+    neon_env_builder.safekeeper_extra_opts = ["--max-timeline-disk-usage-bytes=1024"]
+
+    env = neon_env_builder.init_start()
+
+    # Create a timeline and endpoint
+    env.create_branch("test_timeline_disk_usage_limit")
+    endpoint = env.endpoints.create_start("test_timeline_disk_usage_limit")
+
+    # Get the safekeeper
+    sk = env.safekeepers[0]
+
+    # Inject a failpoint to stop WAL backup
+    with sk.http_client() as http_cli:
+        http_cli.configure_failpoints([("backup-lsn-range-pausable", "pause")])
+
+    # Write some data that will exceed the 1KB limit. While the failpoint is active, this operation
+    # will hang as Postgres encounters safekeeper-returned errors and retries.
+    def run_hanging_insert():
+        with closing(endpoint.connect()) as bg_conn:
+            with bg_conn.cursor() as bg_cur:
+                # This should generate more than 1KB of WAL
+                bg_cur.execute("create table t(key int, value text)")
+                bg_cur.execute("insert into t select generate_series(1,2000), 'payload'")
+
+    # Start the inserts in a background thread
+    bg_thread = threading.Thread(target=run_hanging_insert)
+    bg_thread.start()
+
+    # Wait for the error message to appear in the compute log
+    def error_logged():
+        return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
+
+    wait_until(error_logged)
+    log.info("Found expected error message in compute log, resuming.")
+
+    # Sanity check that the hanging insert is indeed still hanging. Otherwise it means the circuit
+    # breaker we implemented didn't work as expected.
+    time.sleep(2)
+    assert bg_thread.is_alive(), (
+        "The hanging insert somehow unblocked without resolving the disk usage issue!"
+    )
+
+    log.info("Restarting the safekeeper to resume WAL backup.")
+    # Restart the safekeeper with default options to both clear the failpoint and restore the (larger) default disk usage limit.
+    for sk in env.safekeepers:
+        sk.stop().start(extra_opts=[])
+
+    # The hanging insert will now complete. Join the background thread so that we can
+    # verify that the insert completed successfully.
+    bg_thread.join(timeout=120)
+    assert not bg_thread.is_alive(), "Hanging insert did not complete after safekeeper restart"
+    log.info("Hanging insert unblocked.")
+
+    # Verify we can continue to write as normal
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("insert into t select generate_series(2001,3000), 'payload'")
+
+    # Sanity check data correctness
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("select count(*) from t")
+            # 2000 rows from first insert + 1000 from last insert
+            assert cur.fetchone() == (3000,)
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index d281c055b0..72fc58d761 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -13,50 +13,6 @@ if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
 
 
-# Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
-# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
-def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
-    # we assert below that the walreceiver is not active before data writes.
-    # with manually created timelines, it is active.
-    # FIXME: remove this test once we remove timelines_onto_safekeepers
-    neon_env_builder.storage_controller_config = {
-        "timelines_onto_safekeepers": False,
-    }
-
-    # Trigger WAL wait timeout faster
-    neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
-    env = neon_env_builder.init_start()
-    env.pageserver.http_client()
-
-    # In this test we force 'Timed out while waiting for WAL record error' while
-    # fetching basebackup and don't want any retries.
-    os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1"
-
-    tenant_id, timeline_id = env.create_tenant()
-    expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
-    env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
-
-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        exception_string = str(e)
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-        assert "WalReceiver status: Not active" in exception_string, (
-            "Walreceiver should not be active before any data writes"
-        )
-
-    insert_test_elements(env, tenant_id, start=0, count=1_000)
-    try:
-        trigger_wait_lsn_timeout(env, tenant_id)
-    except Exception as e:
-        exception_string = str(e)
-        assert expected_timeout_error in exception_string, "Should time out during waiting for WAL"
-        assert "WalReceiver status: Not active" not in exception_string, (
-            "Should not be inactive anymore after INSERTs are made"
-        )
-        assert "WalReceiver status" in exception_string, "But still should have some other status"
-
-
 # Checks that all active safekeepers are shown in pageserver's walreceiver state printed on WAL wait timeout.
 # Kills one of the safekeepers and ensures that only the active ones are printed in the state.
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/sql_regress/expected/neon-subxacts.out b/test_runner/sql_regress/expected/neon-subxacts.out
new file mode 100644
index 0000000000..5ed8cfcac9
--- /dev/null
+++ b/test_runner/sql_regress/expected/neon-subxacts.out
@@ -0,0 +1,21 @@
+DO $$
+DECLARE
+i numeric;
+BEGIN
+  create role somebody;
+  FOR i IN 1..1000000 LOOP
+    BEGIN
+      IF i % 1000 = 0 THEN
+        alter role somebody password 'welcome';
+      ELSE
+        PERFORM 1;
+      END IF;
+    EXCEPTION WHEN OTHERS THEN
+      RAISE WARNING 'error';
+    END;
+    IF I = 1000000 THEN
+      PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
+    END IF;
+  END LOOP;
+END;
+$$;
diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule
index d1bd7226ed..0ce9f0e28f 100644
--- a/test_runner/sql_regress/parallel_schedule
+++ b/test_runner/sql_regress/parallel_schedule
@@ -10,3 +10,4 @@ test: neon-clog
 test: neon-test-utils
 test: neon-vacuum-full
 test: neon-event-triggers
+test: neon-subxacts
diff --git a/test_runner/sql_regress/sql/neon-subxacts.sql b/test_runner/sql_regress/sql/neon-subxacts.sql
new file mode 100644
index 0000000000..5ed8cfcac9
--- /dev/null
+++ b/test_runner/sql_regress/sql/neon-subxacts.sql
@@ -0,0 +1,21 @@
+DO $$
+DECLARE
+i numeric;
+BEGIN
+  create role somebody;
+  FOR i IN 1..1000000 LOOP
+    BEGIN
+      IF i % 1000 = 0 THEN
+        alter role somebody password 'welcome';
+      ELSE
+        PERFORM 1;
+      END IF;
+    EXCEPTION WHEN OTHERS THEN
+      RAISE WARNING 'error';
+    END;
+    IF I = 1000000 THEN
+      PERFORM pg_log_backend_memory_contexts(pg_backend_pid());
+    END IF;
+  END LOOP;
+END;
+$$;
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index e9a77ca2d6..fc01deb92d 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -40,8 +40,10 @@ env_logger = { version = "0.11" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 form_urlencoded = { version = "1" }
 futures-channel = { version = "0.3", features = ["sink"] }
+futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
+futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
@@ -68,6 +70,7 @@ num-integer = { version = "0.1", features = ["i128"] }
 num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
 num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
+once_cell = { version = "1" }
 p256 = { version = "0.13", features = ["jwk"] }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
@@ -112,10 +115,13 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std
 
 [build-dependencies]
 ahash = { version = "0.8" }
+anstream = { version = "0.6" }
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
+clap = { version = "4", features = ["derive", "env", "string"] }
+clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] }
{ version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } @@ -133,6 +139,7 @@ num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } +once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } @@ -142,6 +149,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } +serde_json = { version = "1", features = ["alloc", "raw_value"] } syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_edit = { version = "0.22", features = ["serde"] }