diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3f66f41ef2..e2203a38ec 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -104,11 +104,10 @@ jobs: # Set some environment variables used by all the steps. # - # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. - # It also includes --features, if any + # CARGO_FLAGS is extra options to pass to all "cargo" subcommands. # - # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, - # because "cargo metadata" doesn't accept --release or --debug options + # CARGO_PROFILE is passed to "cargo build", "cargo test" etc, but not to + # "cargo metadata", because it doesn't accept --release or --debug options. # # We run tests with addtional features, that are turned off by default (e.g. in release builds), see # corresponding Cargo.toml files for their descriptions. @@ -117,16 +116,16 @@ jobs: ARCH: ${{ inputs.arch }} SANITIZERS: ${{ inputs.sanitizers }} run: | - CARGO_FEATURES="--features testing" + CARGO_FLAGS="--locked --features testing" if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked" + CARGO_PROFILE="" elif [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="" - CARGO_FLAGS="--locked" + CARGO_PROFILE="" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FLAGS="--locked --release" + CARGO_PROFILE="--release" fi if [[ $SANITIZERS == 'enabled' ]]; then make_vars="WITH_SANITIZERS=yes" @@ -136,8 +135,8 @@ jobs: { echo "cov_prefix=${cov_prefix}" echo "make_vars=${make_vars}" - echo "CARGO_FEATURES=${CARGO_FEATURES}" echo "CARGO_FLAGS=${CARGO_FLAGS}" + echo "CARGO_PROFILE=${CARGO_PROFILE}" echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" } >> $GITHUB_ENV @@ -189,34 +188,18 @@ jobs: path: pg_install/v17 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v14 -j$(nproc) - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v15 -j$(nproc) - - - name: Build postgres v16 - if: steps.cache_pg_16.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v16 -j$(nproc) - - - name: Build postgres v17 - if: steps.cache_pg_17.outputs.cache-hit != 'true' - run: mold -run make ${make_vars} postgres-v17 -j$(nproc) - - - name: Build neon extensions - run: mold -run make ${make_vars} neon-pg-ext -j$(nproc) + - name: Build all + # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables + run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS" - name: Build walproposer-lib run: mold -run make ${make_vars} walproposer-lib -j$(nproc) - - name: Run cargo build - env: - WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }} + - name: Build unit tests + if: inputs.sanitizers != 'enabled' run: | export ASAN_OPTIONS=detect_leaks=0 - ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS} + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_PROFILE --tests # Do install *before* running rust tests because they might recompile the # binaries with different features/flags. @@ -228,7 +211,7 @@ jobs: # Install target binaries mkdir -p /tmp/neon/bin/ binaries=$( - ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + ${cov_prefix} cargo metadata $CARGO_FLAGS --format-version=1 --no-deps | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' ) for bin in $binaries; do @@ -245,7 +228,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_PROFILE --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do @@ -279,10 +262,10 @@ jobs: export LD_LIBRARY_PATH #nextest does not yet support running doctests - ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_PROFILE # run all non-pageserver tests - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E '!package(pageserver)' # run pageserver tests # (When developing new pageserver features gated by config fields, we commonly make the rust @@ -291,13 +274,13 @@ jobs: # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -306,17 +289,17 @@ jobs: export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install postgres binaries run: | # Use tar to copy files matching the pattern, preserving the paths in the destionation tar c \ pg_install/v* \ - pg_install/build/*/src/test/regress/*.so \ - pg_install/build/*/src/test/regress/pg_regress \ - pg_install/build/*/src/test/isolation/isolationtester \ - pg_install/build/*/src/test/isolation/pg_isolation_regress \ + build/*/src/test/regress/*.so \ + build/*/src/test/regress/pg_regress \ + build/*/src/test/isolation/isolationtester \ + build/*/src/test/isolation/pg_isolation_regress \ | tar x -C /tmp/neon - name: Upload Neon artifact diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 0f7fa3e813..160c3d05bc 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -110,7 +110,7 @@ jobs: build-walproposer-lib: if: | - inputs.pg_versions != '[]' || inputs.rebuild_everything || + contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' @@ -144,7 +144,7 @@ jobs: id: cache_walproposer_lib uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: - path: pg_install/build/walproposer-lib + path: build/walproposer-lib key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Checkout submodule vendor/postgres-v17 @@ -169,11 +169,11 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) - - name: Upload "pg_install/build/walproposer-lib" artifact + - name: Upload "build/walproposer-lib" artifact uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - name: pg_install--build--walproposer-lib - path: pg_install/build/walproposer-lib + name: build--walproposer-lib + path: build/walproposer-lib # The artifact is supposed to be used by the next job in the same workflow, # so there’s no need to store it for too long. retention-days: 1 @@ -226,11 +226,11 @@ jobs: name: pg_install--v17 path: pg_install/v17 - - name: Download "pg_install/build/walproposer-lib" artifact + - name: Download "build/walproposer-lib" artifact uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 with: - name: pg_install--build--walproposer-lib - path: pg_install/build/walproposer-lib + name: build--walproposer-lib + path: build/walproposer-lib # `actions/download-artifact` doesn't preserve permissions: # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7faaed49c1..94f768719f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -670,7 +670,7 @@ jobs: ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: - needs: [ check-permissions, build-build-tools-image, meta ] + needs: [ check-permissions, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials @@ -743,7 +743,6 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true @@ -763,7 +762,6 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true diff --git a/.gitignore b/.gitignore index 45eb4dbf0e..70c7e96303 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /artifact_cache +/build /pg_install /target /tmp_check diff --git a/Cargo.lock b/Cargo.lock index 5ab26b02fa..8cc51350ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4255,6 +4255,7 @@ dependencies = [ "tokio-util", "tonic 0.13.1", "tracing", + "url", "utils", "workspace_hack", ] @@ -4472,6 +4473,8 @@ dependencies = [ "pageserver_api", "postgres_ffi", "prost 0.13.5", + "strum", + "strum_macros", "thiserror 1.0.69", "tokio", "tonic 0.13.1", diff --git a/Dockerfile b/Dockerfile index f72d7d9bbc..69657067de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,6 @@ COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE=release RUN set -e \ && mold -run make -j $(nproc) -s neon-pg-ext \ - && rm -rf pg_install/build \ && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . # Prepare cargo-chef recipe diff --git a/Makefile b/Makefile index 5130e17e59..9824a47255 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,18 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) -# Where to install Postgres, default is ./pg_install, maybe useful for package managers +# Where to install Postgres, default is ./pg_install, maybe useful for package +# managers. POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +# CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked` +# and `--features testing` are popular examples. +# +# CARGO_PROFILE: You can also set to override the cargo profile to +# use. By default, it is derived from BUILD_TYPE. + +# All intermediate build artifacts are stored here. +BUILD_DIR := build + ICU_PREFIX_DIR := /usr/local/icu # @@ -16,12 +26,12 @@ ifeq ($(BUILD_TYPE),release) PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS += -O2 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) - # Unfortunately, `--profile=...` is a nightly feature - CARGO_BUILD_FLAGS += --release + CARGO_PROFILE ?= --profile=release else ifeq ($(BUILD_TYPE),debug) PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS += -O0 -g3 $(CFLAGS) PG_LDFLAGS = $(LDFLAGS) + CARGO_PROFILE ?= --profile=dev else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif @@ -93,7 +103,7 @@ all: neon postgres neon-pg-ext .PHONY: neon neon: postgres-headers walproposer-lib cargo-target-dir +@echo "Compiling Neon" - $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) + $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE) .PHONY: cargo-target-dir cargo-target-dir: # https://github.com/rust-lang/cargo/issues/14281 @@ -104,21 +114,20 @@ cargo-target-dir: # Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. # -$(POSTGRES_INSTALL_DIR)/build/%/config.status: - - mkdir -p $(POSTGRES_INSTALL_DIR) - test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG +$(BUILD_DIR)/%/config.status: + mkdir -p $(BUILD_DIR) + test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG +@echo "Configuring Postgres $* build" @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ exit 1; } - mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* + mkdir -p $(BUILD_DIR)/$* VERSION=$*; \ EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ - (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + (cd $(BUILD_DIR)/$$VERSION && \ env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ @@ -130,74 +139,54 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: # the "build-all-versions" entry points) where direct mention of PostgreSQL # versions is used. .PHONY: postgres-configure-v17 -postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status +postgres-configure-v17: $(BUILD_DIR)/v17/config.status .PHONY: postgres-configure-v16 -postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status +postgres-configure-v16: $(BUILD_DIR)/v16/config.status .PHONY: postgres-configure-v15 -postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +postgres-configure-v15: $(BUILD_DIR)/v15/config.status .PHONY: postgres-configure-v14 -postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status +postgres-configure-v14: $(BUILD_DIR)/v14/config.status # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include .PHONY: postgres-headers-% postgres-headers-%: postgres-configure-% +@echo "Installing PostgreSQL $* headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install + $(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install # Compile and install PostgreSQL .PHONY: postgres-% postgres-%: postgres-configure-% \ postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` +@echo "Compiling PostgreSQL $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install + $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install +@echo "Compiling libpq $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install + $(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install +@echo "Compiling pg_prewarm $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install +@echo "Compiling pg_visibility $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install +@echo "Compiling pg_trgm $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install +@echo "Compiling amcheck $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install +@echo "Compiling test_decoding $*" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install .PHONY: postgres-check-% postgres-check-%: postgres-% - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check + $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check .PHONY: neon-pg-ext-% neon-pg-ext-%: postgres-% - +@echo "Compiling neon $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$* + +@echo "Compiling neon-specific Postgres extensions for $*" + mkdir -p $(BUILD_DIR)/pgxn-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install - +@echo "Compiling neon_walredo $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install - +@echo "Compiling neon_rmgr $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install - +@echo "Compiling neon_test_utils $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install - +@echo "Compiling neon_utils $*" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install + -C $(BUILD_DIR)/pgxn-$*\ + -f $(ROOT_PROJECT_DIR)/pgxn/Makefile install # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. @@ -211,15 +200,15 @@ neon-pg-ext-%: postgres-% .PHONY: walproposer-lib walproposer-lib: neon-pg-ext-v17 +@echo "Compiling walproposer-lib" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib + mkdir -p $(BUILD_DIR)/walproposer-lib $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \ - -C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \ + -C $(BUILD_DIR)/walproposer-lib \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib - cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib - cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib - $(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \ + cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(BUILD_DIR)/walproposer-lib + cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(BUILD_DIR)/walproposer-lib + $(AR) d $(BUILD_DIR)/walproposer-lib/libpgport.a \ pg_strong_random.o - $(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \ + $(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \ checksum_helper.o \ cryptohash_openssl.o \ hmac_openssl.o \ @@ -227,7 +216,7 @@ walproposer-lib: neon-pg-ext-v17 parse_manifest.o \ scram-common.o ifeq ($(UNAME_S),Linux) - $(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \ + $(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \ pg_crc32c.o endif @@ -272,7 +261,7 @@ fmt: postgres-%-pg-bsd-indent: postgres-% +@echo "Compiling pg_bsd_indent" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/ + $(MAKE) -C $(BUILD_DIR)/$*/src/tools/pg_bsd_indent/ # Create typedef list for the core. Note that generally it should be combined with # buildfarm one to cover platform specific stuff. @@ -291,7 +280,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\ cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list +@echo note: you might want to run it on selected files/dirs instead. - INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \ + INDENT=$(BUILD_DIR)/$*/src/tools/pg_bsd_indent/pg_bsd_indent \ $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \ $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \ --excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns @@ -302,9 +291,9 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \ FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \ - INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \ + INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \ PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \ - -C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \ + -C $(BUILD_DIR)/neon-v17 \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 685ac564b7..7cd152f614 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -77,9 +77,6 @@ # build_and_test.yml github workflow for how that's done. ARG PG_VERSION -ARG REPOSITORY=ghcr.io/neondatabase -ARG IMAGE=build-tools -ARG TAG=pinned ARG BUILD_TAG ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim @@ -150,6 +147,7 @@ RUN case $DEBIAN_VERSION in \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \ libclang-dev \ + jsonnet \ $VERSION_INSTALLS \ && apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home @@ -1634,18 +1632,7 @@ FROM pg-build AS neon-ext-build ARG PG_VERSION COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon_test_utils \ - -s install && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - -C pgxn/neon_rmgr \ - -s install +RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute ######################################################################################### # @@ -1735,7 +1722,7 @@ FROM extensions-${EXTENSIONS} AS neon-pg-ext-build # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries # ######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools +FROM build-deps-with-cargo AS compute-tools ARG BUILD_TAG ENV BUILD_TAG=$BUILD_TAG @@ -1745,7 +1732,7 @@ COPY --chown=nonroot . . RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \ --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \ --mount=type=cache,uid=1000,target=/home/nonroot/target \ - mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ + cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \ mkdir target-bin && \ cp target/release-line-debug-size-lto/compute_ctl \ target/release-line-debug-size-lto/fast_import \ @@ -1839,10 +1826,11 @@ RUN rm /usr/local/pgsql/lib/lib*.a # Preprocess the sql_exporter configuration files # ######################################################################################### -FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +FROM build-deps AS sql_exporter_preprocessor ARG PG_VERSION USER nonroot +WORKDIR /home/nonroot COPY --chown=nonroot compute compute diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7a7f2dfedc..684d841897 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -408,7 +408,9 @@ impl ComputeNode { // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; let options = match conn_conf.get_options() { - Some(options) => format!("{} {}", options, EXTRA_OPTIONS), + // Allow the control plane to override any options set by the + // compute + Some(options) => format!("{} {}", EXTRA_OPTIONS, options), None => EXTRA_OPTIONS.to_string(), }; conn_conf.options(&options); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 47b77f0720..387fc297f0 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -209,6 +209,8 @@ pub struct NeonStorageControllerConf { pub use_https_safekeeper_api: bool, pub use_local_compute_notifications: bool, + + pub timeline_safekeeper_count: Option, } impl NeonStorageControllerConf { @@ -236,9 +238,10 @@ impl Default for NeonStorageControllerConf { heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, long_reconcile_threshold: None, use_https_pageserver_api: false, - timelines_onto_safekeepers: false, + timelines_onto_safekeepers: true, use_https_safekeeper_api: false, use_local_compute_notifications: true, + timeline_safekeeper_count: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 755d67a7ad..95f7533057 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -628,6 +628,10 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } + if let Some(sk_cnt) = self.config.timeline_safekeeper_count { + args.push(format!("--timeline-safekeeper-count={sk_cnt}")); + } + println!("Starting storage controller"); background_process::start_process( diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index bbab271474..745c44c05b 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -16,6 +16,7 @@ pub static COMPUTE_AUDIENCE: &str = "compute"; pub enum ComputeClaimsScope { /// An admin-scoped token allows access to all of `compute_ctl`'s authorized /// facilities. + #[serde(rename = "compute_ctl:admin")] Admin, } @@ -24,7 +25,7 @@ impl FromStr for ComputeClaimsScope { fn from_str(s: &str) -> Result { match s { - "admin" => Ok(ComputeClaimsScope::Admin), + "compute_ctl:admin" => Ok(ComputeClaimsScope::Admin), _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")), } } @@ -80,3 +81,23 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } + +#[cfg(test)] +mod test { + use std::str::FromStr; + + use crate::requests::ComputeClaimsScope; + + /// Confirm that whether we parse the scope by string or through serde, the + /// same values parse to the same enum variant. + #[test] + fn compute_request_scopes() { + const ADMIN_SCOPE: &str = "compute_ctl:admin"; + + let from_serde: ComputeClaimsScope = + serde_json::from_str(&format!("\"{ADMIN_SCOPE}\"")).unwrap(); + let from_str = ComputeClaimsScope::from_str(ADMIN_SCOPE).unwrap(); + + assert_eq!(from_serde, from_str); + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 2d7a06a72f..cfb1190a27 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -76,6 +76,10 @@ pub struct PostHogConfig { pub private_api_url: String, /// Public API URL pub public_api_url: String, + /// Refresh interval for the feature flag spec + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] + pub refresh_interval: Option, } /// `pageserver.toml` @@ -816,7 +820,7 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_ENABLED: bool = true; pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index e2de02eea0..07cada2eb1 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -23,22 +23,12 @@ pub struct ReAttachRequest { pub register: Option, } -fn default_mode() -> LocationConfigMode { - LocationConfigMode::AttachedSingle -} - #[derive(Serialize, Deserialize, Debug)] pub struct ReAttachResponseTenant { pub id: TenantShardId, /// Mandatory if LocationConfigMode is None or set to an Attached* mode pub r#gen: Option, - - /// Default value only for backward compat: this field should be set - #[serde(default = "default_mode")] pub mode: LocationConfigMode, - - // Default value only for backward compat: this field should be set - #[serde(default = "ShardStripeSize::default")] pub stripe_size: ShardStripeSize, } #[derive(Serialize, Deserialize)] diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index 693d62efc4..dc813ccb4a 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -36,7 +36,10 @@ impl FeatureResolverBackgroundLoop { // Main loop of updating the feature flags. handle.spawn( async move { - tracing::info!("Starting PostHog feature resolver"); + tracing::info!( + "Starting PostHog feature resolver with refresh period: {:?}", + refresh_period + ); let mut ticker = tokio::time::interval(refresh_period); ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); loop { diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs index 4c2a5ef50f..94fbf333ed 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_query.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -1,5 +1,3 @@ -use std::io; - use tokio::net::TcpStream; use crate::client::SocketConfig; @@ -8,7 +6,7 @@ use crate::tls::MakeTlsConnect; use crate::{Error, cancel_query_raw, connect_socket}; pub(crate) async fn cancel_query( - config: Option, + config: SocketConfig, ssl_mode: SslMode, tls: T, process_id: i32, @@ -17,16 +15,6 @@ pub(crate) async fn cancel_query( where T: MakeTlsConnect, { - let config = match config { - Some(config) => config, - None => { - return Err(Error::connect(io::Error::new( - io::ErrorKind::InvalidInput, - "unknown host", - ))); - } - }; - let hostname = match &config.host { Host::Tcp(host) => &**host, }; diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index f6526395ee..c5566b4ad9 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -7,11 +7,16 @@ use crate::config::SslMode; use crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Error, cancel_query, cancel_query_raw}; -/// The capability to request cancellation of in-progress queries on a -/// connection. -#[derive(Clone, Serialize, Deserialize)] +/// A cancellation token that allows easy cancellation of a query. +#[derive(Clone)] pub struct CancelToken { - pub socket_config: Option, + pub socket_config: SocketConfig, + pub raw: RawCancelToken, +} + +/// A raw cancellation token that allows cancellation of a query, given a fresh connection to postgres. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RawCancelToken { pub ssl_mode: SslMode, pub process_id: i32, pub secret_key: i32, @@ -36,14 +41,16 @@ impl CancelToken { { cancel_query::cancel_query( self.socket_config.clone(), - self.ssl_mode, + self.raw.ssl_mode, tls, - self.process_id, - self.secret_key, + self.raw.process_id, + self.raw.secret_key, ) .await } +} +impl RawCancelToken { /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new /// connection itself. pub async fn cancel_query_raw(&self, stream: S, tls: T) -> Result<(), Error> diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index a7edfc076a..41b22e35b6 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -12,6 +12,7 @@ use postgres_protocol2::message::frontend; use serde::{Deserialize, Serialize}; use tokio::sync::mpsc; +use crate::cancel_token::RawCancelToken; use crate::codec::{BackendMessages, FrontendMessage}; use crate::config::{Host, SslMode}; use crate::query::RowStream; @@ -331,10 +332,12 @@ impl Client { /// connection associated with this client. pub fn cancel_token(&self) -> CancelToken { CancelToken { - socket_config: Some(self.socket_config.clone()), - ssl_mode: self.ssl_mode, - process_id: self.process_id, - secret_key: self.secret_key, + socket_config: self.socket_config.clone(), + raw: RawCancelToken { + ssl_mode: self.ssl_mode, + process_id: self.process_id, + secret_key: self.secret_key, + }, } } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 9556070ed5..791c93b972 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -3,7 +3,7 @@ use postgres_protocol2::message::backend::ReadyForQueryBody; -pub use crate::cancel_token::CancelToken; +pub use crate::cancel_token::{CancelToken, RawCancelToken}; pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; pub use crate::connect_raw::RawConnection; diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 530ceb1327..b13c8b32b4 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -13,22 +13,24 @@ fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); + let root_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + // Finding the location of built libraries and Postgres C headers: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { postgres_install_dir.into() } else { - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install") + root_path.join("pg_install") }; let pg_install_abs = std::fs::canonicalize(pg_install_dir)?; - let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib"); + let walproposer_lib_dir = root_path.join("build/walproposer-lib"); let walproposer_lib_search_str = walproposer_lib_dir .to_str() .ok_or(anyhow!("Bad non-UTF path"))?; - let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon"); + let pgxn_neon = root_path.join("pgxn/neon"); let pgxn_neon = std::fs::canonicalize(pgxn_neon)?; let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 606ba9ad8c..8a2e2ed3be 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,9 @@ testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", " fuzz-read-path = ["testing"] +# Enables benchmarking only APIs +benchmarking = [] + [dependencies] anyhow.workspace = true arc-swap.workspace = true @@ -127,6 +130,7 @@ harness = false [[bench]] name = "bench_ingest" harness = false +required-features = ["benchmarking"] [[bench]] name = "upload_queue" diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 681d135e09..438c6e235e 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -1,22 +1,29 @@ use std::env; use std::num::NonZeroUsize; +use std::sync::Arc; use bytes::Bytes; use camino::Utf8PathBuf; use criterion::{Criterion, criterion_group, criterion_main}; +use futures::stream::FuturesUnordered; use pageserver::config::PageServerConf; use pageserver::context::{DownloadBehavior, RequestContext}; +use pageserver::keyspace::KeySpace; use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::storage_layer::InMemoryLayer; +use pageserver::tenant::storage_layer::IoConcurrency; +use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState}; use pageserver::{page_cache, virtual_file}; +use pageserver_api::config::GetVectoredConcurrentIo; use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; -use strum::IntoEnumIterator; +use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; +use utils::sync::gate::Gate; use wal_decoder::models::value::Value; use wal_decoder::serialized_batch::SerializedValueBatch; @@ -30,7 +37,7 @@ fn murmurhash32(mut h: u32) -> u32 { h } -#[derive(serde::Serialize, Clone, Copy, Debug)] +#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)] enum KeyLayout { /// Sequential unique keys Sequential, @@ -40,19 +47,30 @@ enum KeyLayout { RandomReuse(u32), } -#[derive(serde::Serialize, Clone, Copy, Debug)] +#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)] enum WriteDelta { Yes, No, } +#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)] +enum ConcurrentReads { + Yes, + No, +} + async fn ingest( conf: &'static PageServerConf, put_size: usize, put_count: usize, key_layout: KeyLayout, write_delta: WriteDelta, + concurrent_reads: ConcurrentReads, ) -> anyhow::Result<()> { + if concurrent_reads == ConcurrentReads::Yes { + assert_eq!(key_layout, KeyLayout::Sequential); + } + let mut lsn = utils::lsn::Lsn(1000); let mut key = Key::from_i128(0x0); @@ -68,16 +86,18 @@ async fn ingest( let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - lsn, - &gate, - &cancel, - &ctx, - ) - .await?; + let layer = Arc::new( + InMemoryLayer::create( + conf, + timeline_id, + tenant_shard_id, + lsn, + &gate, + &cancel, + &ctx, + ) + .await?, + ); let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; @@ -86,6 +106,61 @@ async fn ingest( pageserver::context::DownloadBehavior::Download, ); + const READ_BATCH_SIZE: u32 = 32; + let (tx, mut rx) = tokio::sync::watch::channel::>(None); + let reader_cancel = CancellationToken::new(); + let reader_handle = if concurrent_reads == ConcurrentReads::Yes { + Some(tokio::task::spawn({ + let cancel = reader_cancel.clone(); + let layer = layer.clone(); + let ctx = ctx.attached_child(); + async move { + let gate = Gate::default(); + let gate_guard = gate.enter().unwrap(); + let io_concurrency = IoConcurrency::spawn_from_conf( + GetVectoredConcurrentIo::SidecarTask, + gate_guard, + ); + + rx.wait_for(|key| key.is_some()).await.unwrap(); + + while !cancel.is_cancelled() { + let key = match *rx.borrow() { + Some(some) => some, + None => unreachable!(), + }; + + let mut start_key = key; + start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE); + let key_range = start_key..key.next(); + + let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + + layer + .get_values_reconstruct_data( + KeySpace::single(key_range), + Lsn(1)..Lsn(u64::MAX), + &mut reconstruct_state, + &ctx, + ) + .await + .unwrap(); + + let mut collect_futs = std::mem::take(&mut reconstruct_state.keys) + .into_values() + .map(|state| state.sink_pending_ios()) + .collect::>(); + while collect_futs.next().await.is_some() {} + } + + drop(io_concurrency); + gate.close().await; + } + })) + } else { + None + }; + const BATCH_SIZE: usize = 16; let mut batch = Vec::new(); @@ -113,19 +188,27 @@ async fn ingest( batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); if batch.len() >= BATCH_SIZE { + let last_key = Key::from_compact(batch.last().unwrap().0); + let this_batch = std::mem::take(&mut batch); let serialized = SerializedValueBatch::from_values(this_batch); layer.put_batch(serialized, &ctx).await?; + + tx.send(Some(last_key)).unwrap(); } } if !batch.is_empty() { + let last_key = Key::from_compact(batch.last().unwrap().0); + let this_batch = std::mem::take(&mut batch); let serialized = SerializedValueBatch::from_values(this_batch); layer.put_batch(serialized, &ctx).await?; + + tx.send(Some(last_key)).unwrap(); } layer.freeze(lsn + 1).await; - if matches!(write_delta, WriteDelta::Yes) { + if write_delta == WriteDelta::Yes { let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct { max_concurrency: NonZeroUsize::new(1).unwrap(), }); @@ -136,6 +219,11 @@ async fn ingest( tokio::fs::remove_file(path).await?; } + reader_cancel.cancel(); + if let Some(handle) = reader_handle { + handle.await.unwrap(); + } + Ok(()) } @@ -147,6 +235,7 @@ fn ingest_main( put_count: usize, key_layout: KeyLayout, write_delta: WriteDelta, + concurrent_reads: ConcurrentReads, ) { pageserver::virtual_file::set_io_mode(io_mode); @@ -156,7 +245,15 @@ fn ingest_main( .unwrap(); runtime.block_on(async move { - let r = ingest(conf, put_size, put_count, key_layout, write_delta).await; + let r = ingest( + conf, + put_size, + put_count, + key_layout, + write_delta, + concurrent_reads, + ) + .await; if let Err(e) = r { panic!("{e:?}"); } @@ -195,6 +292,7 @@ fn criterion_benchmark(c: &mut Criterion) { key_size: usize, key_layout: KeyLayout, write_delta: WriteDelta, + concurrent_reads: ConcurrentReads, } #[derive(Clone)] struct HandPickedParameters { @@ -245,7 +343,7 @@ fn criterion_benchmark(c: &mut Criterion) { ]; let exploded_parameters = { let mut out = Vec::new(); - for io_mode in IoMode::iter() { + for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] { for param in expect.clone() { let HandPickedParameters { volume_mib, @@ -253,12 +351,18 @@ fn criterion_benchmark(c: &mut Criterion) { key_layout, write_delta, } = param; + + if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes { + continue; + } + out.push(ExplodedParameters { - io_mode, + io_mode: IoMode::DirectRw, volume_mib, key_size, key_layout, write_delta, + concurrent_reads, }); } } @@ -272,9 +376,10 @@ fn criterion_benchmark(c: &mut Criterion) { key_size, key_layout, write_delta, + concurrent_reads, } = self; format!( - "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}" + "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}" ) } } @@ -287,12 +392,23 @@ fn criterion_benchmark(c: &mut Criterion) { key_size, key_layout, write_delta, + concurrent_reads, } = params; let put_count = volume_mib * 1024 * 1024 / key_size; group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64)); group.sample_size(10); group.bench_function(id, |b| { - b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta)) + b.iter(|| { + ingest_main( + conf, + io_mode, + key_size, + put_count, + key_layout, + write_delta, + concurrent_reads, + ) + }) }); } } diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 8b13b9e1db..c5283c2b09 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -11,6 +11,8 @@ futures.workspace = true pageserver_api.workspace = true postgres_ffi.workspace = true prost.workspace = true +strum.workspace = true +strum_macros.workspace = true thiserror.workspace = true tokio.workspace = true tonic.workspace = true diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 274f036f3d..aa4774c056 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -121,7 +121,7 @@ impl Client { pub async fn get_base_backup( &mut self, req: model::GetBaseBackupRequest, - ) -> Result>, tonic::Status> { + ) -> Result> + 'static, tonic::Status> { let proto_req = proto::GetBaseBackupRequest::from(req); let response_stream: Streaming = diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index ef7f89473f..6efa742799 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -459,7 +459,7 @@ impl GetPageResponse { /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream /// (potentially shared by many backends), and a gRPC status response would terminate the stream so /// we send GetPageResponse messages with these codes instead. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, PartialEq, strum_macros::Display)] pub enum GetPageStatusCode { /// Unknown status. For forwards compatibility: used when an older client version receives a new /// status code from a newer server version. diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 5e4af88e69..f5dfc0db25 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -25,6 +25,7 @@ tokio.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tonic.workspace = true +url.workspace = true pageserver_client.workspace = true pageserver_api.workspace = true diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 43ad92980c..e028174c1d 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,20 +1,29 @@ use std::collections::HashMap; use std::num::NonZeroUsize; use std::ops::Range; -use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Instant; -use anyhow::Context; +use anyhow::anyhow; +use futures::TryStreamExt as _; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; +use pageserver_page_api as page_api; use rand::prelude::*; +use tokio::io::AsyncRead; use tokio::sync::Barrier; use tokio::task::JoinSet; +use tokio_util::compat::{TokioAsyncReadCompatExt as _, TokioAsyncWriteCompatExt as _}; +use tokio_util::io::StreamReader; +use tonic::async_trait; use tracing::{info, instrument}; +use url::Url; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; @@ -24,14 +33,15 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "postgres://postgres@localhost:64000")] + /// The Pageserver to connect to. Use postgresql:// for libpq, or grpc:// for gRPC. + #[clap(long, default_value = "postgresql://postgres@localhost:64000")] page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] num_clients: NonZeroUsize, - #[clap(long, default_value = "1.0")] - gzip_probability: f64, + #[clap(long)] + no_compression: bool, #[clap(long)] runtime: Option, #[clap(long)] @@ -146,12 +156,27 @@ async fn main_impl( let mut work_senders = HashMap::new(); let mut tasks = Vec::new(); - for tl in &timelines { + let scheme = match Url::parse(&args.page_service_connstring) { + Ok(url) => url.scheme().to_lowercase().to_string(), + Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), + Err(err) => return Err(anyhow!("invalid connstring: {err}")), + }; + for &tl in &timelines { let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are work_senders.insert(tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, + + let client: Box = match scheme.as_str() { + "postgresql" | "postgres" => Box::new( + LibpqClient::new(&args.page_service_connstring, tl, !args.no_compression).await?, + ), + "grpc" => Box::new( + GrpcClient::new(&args.page_service_connstring, tl, !args.no_compression).await?, + ), + scheme => return Err(anyhow!("invalid scheme {scheme}")), + }; + + tasks.push(tokio::spawn(run_worker( + client, Arc::clone(&start_work_barrier), receiver, Arc::clone(&all_work_done_barrier), @@ -166,13 +191,7 @@ async fn main_impl( let mut rng = rand::thread_rng(); let target = all_targets.choose(&mut rng).unwrap(); let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); - ( - target.timeline, - Work { - lsn, - gzip: rng.gen_bool(args.gzip_probability), - }, - ) + (target.timeline, Work { lsn }) }; let sender = work_senders.get(&timeline).unwrap(); // TODO: what if this blocks? @@ -216,13 +235,11 @@ async fn main_impl( #[derive(Copy, Clone)] struct Work { lsn: Option, - gzip: bool, } #[instrument(skip_all)] -async fn client( - args: &'static Args, - timeline: TenantTimelineId, +async fn run_worker( + mut client: Box, start_work_barrier: Arc, mut work: tokio::sync::mpsc::Receiver, all_work_done_barrier: Arc, @@ -230,37 +247,14 @@ async fn client( ) { start_work_barrier.wait().await; - let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - - while let Some(Work { lsn, gzip }) = work.recv().await { + while let Some(Work { lsn }) = work.recv().await { let start = Instant::now(); - let copy_out_stream = client - .basebackup(&BasebackupRequest { - tenant_id: timeline.tenant_id, - timeline_id: timeline.timeline_id, - lsn, - gzip, - }) - .await - .with_context(|| format!("start basebackup for {timeline}")) - .unwrap(); + let stream = client.basebackup(lsn).await.unwrap(); - use futures::StreamExt; - let size = Arc::new(AtomicUsize::new(0)); - copy_out_stream - .for_each({ - |r| { - let size = Arc::clone(&size); - async move { - let size = Arc::clone(&size); - size.fetch_add(r.unwrap().len(), Ordering::Relaxed); - } - } - }) - .await; - info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + let size = futures::io::copy(stream.compat(), &mut tokio::io::sink().compat_write()) + .await + .unwrap(); + info!("basebackup size is {size} bytes"); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { @@ -270,3 +264,94 @@ async fn client( all_work_done_barrier.wait().await; } + +/// A basebackup client. This allows switching out the client protocol implementation. +#[async_trait] +trait Client: Send { + async fn basebackup( + &mut self, + lsn: Option, + ) -> anyhow::Result>>; +} + +/// A libpq-based Pageserver client. +struct LibpqClient { + inner: pageserver_client::page_service::Client, + ttid: TenantTimelineId, + compression: bool, +} + +impl LibpqClient { + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + Ok(Self { + inner: pageserver_client::page_service::Client::new(connstring.to_string()).await?, + ttid, + compression, + }) + } +} + +#[async_trait] +impl Client for LibpqClient { + async fn basebackup( + &mut self, + lsn: Option, + ) -> anyhow::Result>> { + let req = BasebackupRequest { + tenant_id: self.ttid.tenant_id, + timeline_id: self.ttid.timeline_id, + lsn, + gzip: self.compression, + }; + let stream = self.inner.basebackup(&req).await?; + Ok(Box::pin(StreamReader::new( + stream.map_err(std::io::Error::other), + ))) + } +} + +/// A gRPC Pageserver client. +struct GrpcClient { + inner: page_api::Client, +} + +impl GrpcClient { + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + let inner = page_api::Client::new( + connstring.to_string(), + ttid.tenant_id, + ttid.timeline_id, + ShardIndex::unsharded(), + None, + compression.then_some(tonic::codec::CompressionEncoding::Zstd), + ) + .await?; + Ok(Self { inner }) + } +} + +#[async_trait] +impl Client for GrpcClient { + async fn basebackup( + &mut self, + lsn: Option, + ) -> anyhow::Result>> { + let req = page_api::GetBaseBackupRequest { + lsn, + replica: false, + full: false, + }; + let stream = self.inner.get_base_backup(req).await?; + Ok(Box::pin(StreamReader::new( + stream.map_err(std::io::Error::other), + ))) + } +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 3a68a77279..a297819e9b 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -10,33 +10,31 @@ use anyhow::Context; use async_trait::async_trait; use bytes::Bytes; use camino::Utf8PathBuf; +use futures::{Stream, StreamExt as _}; use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; -use pageserver_page_api::proto; +use pageserver_page_api as page_api; use rand::prelude::*; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; +use url::Url; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use crate::util::tokio_thread_local_stats::AllThreadLocalStats; use crate::util::{request_stats, tokio_thread_local_stats}; -#[derive(clap::ValueEnum, Clone, Debug)] -enum Protocol { - Libpq, - Grpc, -} - /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. #[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, + /// Pageserver connection string. Supports postgresql:// and grpc:// protocols. #[clap(long, default_value = "postgres://postgres@localhost:64000")] page_service_connstring: String, #[clap(long)] @@ -45,8 +43,9 @@ pub(crate) struct Args { num_clients: NonZeroUsize, #[clap(long)] runtime: Option, - #[clap(long, value_enum, default_value = "libpq")] - protocol: Protocol, + /// If true, enable compression (only for gRPC). + #[clap(long)] + compression: bool, /// Each client sends requests at the given rate. /// /// If a request takes too long and we should be issuing a new request already, @@ -325,18 +324,32 @@ async fn main_impl( .unwrap(); Box::pin(async move { - let client: Box = match args.protocol { - Protocol::Libpq => Box::new( - LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline) - .await - .unwrap(), + let scheme = match Url::parse(&args.page_service_connstring) { + Ok(url) => url.scheme().to_lowercase().to_string(), + Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), + Err(err) => panic!("invalid connstring: {err}"), + }; + let client: Box = match scheme.as_str() { + "postgresql" | "postgres" => { + assert!(!args.compression, "libpq does not support compression"); + Box::new( + LibpqClient::new(&args.page_service_connstring, worker_id.timeline) + .await + .unwrap(), + ) + } + + "grpc" => Box::new( + GrpcClient::new( + &args.page_service_connstring, + worker_id.timeline, + args.compression, + ) + .await + .unwrap(), ), - Protocol::Grpc => Box::new( - GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline) - .await - .unwrap(), - ), + scheme => panic!("unsupported scheme {scheme}"), }; run_worker(args, client, ss, cancel, rps_period, ranges, weights).await }) @@ -543,8 +556,8 @@ struct LibpqClient { } impl LibpqClient { - async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result { - let inner = pageserver_client::page_service::Client::new(connstring) + async fn new(connstring: &str, ttid: TenantTimelineId) -> anyhow::Result { + let inner = pageserver_client::page_service::Client::new(connstring.to_string()) .await? .pagestream(ttid.tenant_id, ttid.timeline_id) .await?; @@ -600,34 +613,36 @@ impl Client for LibpqClient { } } -/// A gRPC client using the raw, no-frills gRPC client. +/// A gRPC Pageserver client. struct GrpcClient { - req_tx: tokio::sync::mpsc::Sender, - resp_rx: tonic::Streaming, + req_tx: tokio::sync::mpsc::Sender, + resp_rx: Pin> + Send>>, } impl GrpcClient { - async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result { - let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?; + async fn new( + connstring: &str, + ttid: TenantTimelineId, + compression: bool, + ) -> anyhow::Result { + let mut client = page_api::Client::new( + connstring.to_string(), + ttid.tenant_id, + ttid.timeline_id, + ShardIndex::unsharded(), + None, + compression.then_some(tonic::codec::CompressionEncoding::Zstd), + ) + .await?; // The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the // benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are // buffered by Tonic and the OS too. let (req_tx, req_rx) = tokio::sync::mpsc::channel(1); let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); - let mut req = tonic::Request::new(req_stream); - let metadata = req.metadata_mut(); - metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?); - metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?); - metadata.insert("neon-shard-id", "0000".try_into()?); + let resp_rx = Box::pin(client.get_pages(req_stream).await?); - let resp = client.get_pages(req).await?; - let resp_stream = resp.into_inner(); - - Ok(Self { - req_tx, - resp_rx: resp_stream, - }) + Ok(Self { req_tx, resp_rx }) } } @@ -641,27 +656,27 @@ impl Client for GrpcClient { rel: RelTag, blks: Vec, ) -> anyhow::Result<()> { - let req = proto::GetPageRequest { + let req = page_api::GetPageRequest { request_id: req_id, - request_class: proto::GetPageClass::Normal as i32, - read_lsn: Some(proto::ReadLsn { - request_lsn: req_lsn.0, - not_modified_since_lsn: mod_lsn.0, - }), - rel: Some(rel.into()), - block_number: blks, + request_class: page_api::GetPageClass::Normal, + read_lsn: page_api::ReadLsn { + request_lsn: req_lsn, + not_modified_since_lsn: Some(mod_lsn), + }, + rel, + block_numbers: blks, }; self.req_tx.send(req).await?; Ok(()) } async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec)> { - let resp = self.resp_rx.message().await?.unwrap(); + let resp = self.resp_rx.next().await.unwrap().unwrap(); anyhow::ensure!( - resp.status_code == proto::GetPageStatusCode::Ok as i32, + resp.status_code == page_api::GetPageStatusCode::Ok, "unexpected status code: {}", - resp.status_code + resp.status_code, ); - Ok((resp.request_id, resp.page_image)) + Ok((resp.request_id, resp.page_images)) } } diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 84edd68011..b4e6f78bf2 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -12,6 +12,8 @@ use utils::id::TenantId; use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; +const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600); + #[derive(Clone)] pub struct FeatureResolver { inner: Option>, @@ -139,10 +141,13 @@ impl FeatureResolver { } tenants }; - // TODO: make refresh period configurable - inner - .clone() - .spawn(handle, Duration::from_secs(60), fake_tenants); + inner.clone().spawn( + handle, + posthog_config + .refresh_interval + .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL), + fake_tenants, + ); Ok(FeatureResolver { inner: Some(inner), internal_properties: Some(internal_properties), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bf54614baa..8d6d342cf9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3426,7 +3426,7 @@ impl TimelineMetrics { pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) { assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); let labels = self.make_frozen_layer_labels(layer); - let size = layer.try_len().expect("frozen layer should have no writer"); + let size = layer.len(); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() @@ -3441,7 +3441,7 @@ impl TimelineMetrics { pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) { assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. })); let labels = self.make_frozen_layer_labels(layer); - let size = layer.try_len().expect("frozen layer should have no writer"); + let size = layer.len(); TIMELINE_LAYER_COUNT .get_metric_with_label_values(&labels) .unwrap() diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 642b447e5f..032db34983 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3544,8 +3544,9 @@ impl proto::PageService for GrpcPageServiceHandler { &self, req: tonic::Request, ) -> Result, tonic::Status> { - // Send 64 KB chunks to avoid large memory allocations. - const CHUNK_SIZE: usize = 64 * 1024; + // Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this + // to be the sweet spot where throughput is saturated. + const CHUNK_SIZE: usize = 256 * 1024; let timeline = self.get_request_timeline(&req).await?; let ctx = self.ctx.with_scope_timeline(&timeline); diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 030b43a020..c5087f7e0f 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -61,8 +61,10 @@ pub(crate) struct LocationConf { /// The detailed shard identity. This structure is already scoped within /// a TenantShardId, but we need the full ShardIdentity to enable calculating /// key->shard mappings. - // TODO(vlad): Remove this default once all configs have a shard identity on disk. - #[serde(default = "ShardIdentity::unsharded")] + /// + /// NB: we store this even for unsharded tenants, so that we agree with storcon on the intended + /// stripe size. Otherwise, a split request that does not specify a stripe size may use a + /// different default than storcon, which can lead to incorrect stripe sizes and corruption. pub(crate) shard: ShardIdentity, /// The pan-cluster tenant configuration, the same on all locations diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 2edf22e9fd..203b5bf592 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,7 +3,7 @@ use std::io; use std::sync::Arc; -use std::sync::atomic::AtomicU64; +use std::sync::atomic::{AtomicU64, Ordering}; use camino::Utf8PathBuf; use num_traits::Num; @@ -18,6 +18,7 @@ use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; +use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; @@ -30,9 +31,13 @@ pub struct EphemeralFile { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, - bytes_written: u64, file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter, - buffered_writer: BufferedWriter, + + buffered_writer: tokio::sync::RwLock, + + bytes_written: AtomicU64, + + resource_units: std::sync::Mutex, } type BufferedWriter = owned_buffers_io::write::BufferedWriter< @@ -94,9 +99,8 @@ impl EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, page_cache_file_id, - bytes_written: 0, file: file.clone(), - buffered_writer: BufferedWriter::new( + buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new( file, 0, || IoBufferMut::with_capacity(TAIL_SZ), @@ -104,7 +108,9 @@ impl EphemeralFile { cancel.child_token(), ctx, info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename), - ), + )), + bytes_written: AtomicU64::new(0), + resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()), }) } } @@ -151,15 +157,17 @@ impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter #[derive(Debug, thiserror::Error)] pub(crate) enum EphemeralFileWriteError { - #[error("{0}")] - TooLong(String), #[error("cancelled")] Cancelled, } impl EphemeralFile { pub(crate) fn len(&self) -> u64 { - self.bytes_written + // TODO(vlad): The value returned here is not always correct if + // we have more than one concurrent writer. Writes are always + // sequenced, but we could grab the buffered writer lock if we wanted + // to. + self.bytes_written.load(Ordering::Acquire) } pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { @@ -186,7 +194,7 @@ impl EphemeralFile { /// Panics if the write is short because there's no way we can recover from that. /// TODO: make upstack handle this as an error. pub(crate) async fn write_raw( - &mut self, + &self, srcbuf: &[u8], ctx: &RequestContext, ) -> Result { @@ -198,22 +206,13 @@ impl EphemeralFile { } async fn write_raw_controlled( - &mut self, + &self, srcbuf: &[u8], ctx: &RequestContext, ) -> Result<(u64, Option), EphemeralFileWriteError> { - let pos = self.bytes_written; + let mut writer = self.buffered_writer.write().await; - let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { - EphemeralFileWriteError::TooLong(format!( - "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}", - srcbuf_len = srcbuf.len(), - )) - })?; - - // Write the payload - let (nwritten, control) = self - .buffered_writer + let (nwritten, control) = writer .write_buffered_borrowed_controlled(srcbuf, ctx) .await .map_err(|e| match e { @@ -225,43 +224,69 @@ impl EphemeralFile { "buffered writer has no short writes" ); - self.bytes_written = new_bytes_written; + // There's no realistic risk of overflow here. We won't have exabytes sized files on disk. + let pos = self + .bytes_written + .fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel); + + let mut resource_units = self.resource_units.lock().unwrap(); + resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed)); Ok((pos, control)) } + + pub(crate) fn tick(&self) -> Option { + let mut resource_units = self.resource_units.lock().unwrap(); + let len = self.bytes_written.load(Ordering::Relaxed); + resource_units.publish_size(len) + } } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { async fn read_exact_at_eof_ok( &self, start: u64, - dst: tokio_epoll_uring::Slice, + mut dst: tokio_epoll_uring::Slice, ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let submitted_offset = self.buffered_writer.bytes_submitted(); + // We will fill the slice in back to front. Hence, we need + // the slice to be fully initialized. + // TODO(vlad): Is there a nicer way of doing this? + dst.as_mut_rust_slice_full_zeroed(); - let mutable = match self.buffered_writer.inspect_mutable() { - Some(mutable) => &mutable[0..mutable.pending()], - None => { - // Timeline::cancel and hence buffered writer flush was cancelled. - // Remain read-available while timeline is shutting down. - &[] - } - }; + let writer = self.buffered_writer.read().await; - let maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); + // Read bytes written while under lock. This is a hack to deal with concurrent + // writes updating the number of bytes written. `bytes_written` is not DIO alligned + // but we may end the read there. + // + // TODO(vlad): Feels like there's a nicer path where we align the end if it + // shoots over the end of the file. + let bytes_written = self.bytes_written.load(Ordering::Acquire); let dst_cap = dst.bytes_total().into_u64(); let end = { // saturating_add is correct here because the max file size is u64::MAX, so, // if start + dst.len() > u64::MAX, then we know it will be a short read let mut end: u64 = start.saturating_add(dst_cap); - if end > self.bytes_written { - end = self.bytes_written; + if end > bytes_written { + end = bytes_written; } end }; + let submitted_offset = writer.bytes_submitted(); + let maybe_flushed = writer.inspect_maybe_flushed(); + + let mutable = match writer.inspect_mutable() { + Some(mutable) => &mutable[0..mutable.pending()], + None => { + // Timeline::cancel and hence buffered writer flush was cancelled. + // Remain read-available while timeline is shutting down. + &[] + } + }; + // inclusive, exclusive #[derive(Debug)] struct Range(N, N); @@ -306,13 +331,33 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral let mutable_range = Range(std::cmp::max(start, submitted_offset), end); - let dst = if written_range.len() > 0 { + // There are three sources from which we might have to read data: + // 1. The file itself + // 2. The buffer which contains changes currently being flushed + // 3. The buffer which contains chnages yet to be flushed + // + // For better concurrency, we do them in reverse order: perform the in-memory + // reads while holding the writer lock, drop the writer lock and read from the + // file if required. + + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(submitted_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; let bounds = dst.bounds(); - let slice = self - .file - .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) - .await?; - Slice::from_buf_bounds(Slice::into_inner(slice), bounds) + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) } else { dst }; @@ -342,24 +387,15 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if mutable_range.len() > 0 { - let offset_in_buffer = mutable_range - .0 - .checked_sub(submitted_offset) - .unwrap() - .into_usize(); - let to_copy = - &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + drop(writer); + + let dst = if written_range.len() > 0 { let bounds = dst.bounds(); - let mut view = dst.slice({ - let start = - written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); - let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); - start..end - }); - view.as_mut_rust_slice_full_zeroed() - .copy_from_slice(to_copy); - Slice::from_buf_bounds(Slice::into_inner(view), bounds) + let slice = self + .file + .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) + .await?; + Slice::from_buf_bounds(Slice::into_inner(slice), bounds) } else { dst }; @@ -460,13 +496,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); - let mutable = file.buffered_writer.mutable(); + let writer = file.buffered_writer.read().await; + let mutable = writer.mutable(); let cap = mutable.capacity(); let align = mutable.align(); + drop(writer); let write_nbytes = cap * 2 + cap / 2; @@ -504,10 +542,11 @@ mod tests { let file_contents = std::fs::read(file.file.path()).unwrap(); assert!(file_contents == content[0..cap * 2]); - let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + let writer = file.buffered_writer.read().await; + let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap(); assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); - let mutable_buffer_contents = file.buffered_writer.mutable(); + let mutable_buffer_contents = writer.mutable(); assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } @@ -517,12 +556,14 @@ mod tests { let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); // mutable buffer and maybe_flushed buffer each has `cap` bytes. - let cap = file.buffered_writer.mutable().capacity(); + let writer = file.buffered_writer.read().await; + let cap = writer.mutable().capacity(); + drop(writer); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -540,12 +581,13 @@ mod tests { 2 * cap.into_u64(), "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); + let writer = file.buffered_writer.read().await; assert_eq!( - &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &writer.inspect_maybe_flushed().unwrap()[0..cap], &content[cap..cap * 2] ); assert_eq!( - &file.buffered_writer.mutable()[0..cap / 2], + &writer.mutable()[0..cap / 2], &content[cap * 2..cap * 2 + cap / 2] ); } @@ -563,13 +605,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); let cancel = CancellationToken::new(); - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx) .await .unwrap(); - let mutable = file.buffered_writer.mutable(); + let writer = file.buffered_writer.read().await; + let mutable = writer.mutable(); let cap = mutable.capacity(); let align = mutable.align(); + drop(writer); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) .take(cap * 2 + cap / 2) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e65d444f76..9fbb9d2438 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -109,7 +109,7 @@ pub(crate) enum OnDiskValue { /// Reconstruct data accumulated for a single key during a vectored get #[derive(Debug, Default)] -pub(crate) struct VectoredValueReconstructState { +pub struct VectoredValueReconstructState { pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>, pub(crate) situation: ValueReconstructSituation, @@ -244,13 +244,60 @@ impl VectoredValueReconstructState { res } + + /// Benchmarking utility to await for the completion of all pending ios + /// + /// # Cancel-Safety + /// + /// Technically fine to stop polling this future, but, the IOs will still + /// be executed to completion by the sidecar task and hold on to / consume resources. + /// Better not do it to make reasonsing about the system easier. + #[cfg(feature = "benchmarking")] + pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> { + let mut res = Ok(()); + + // We should try hard not to bail early, so that by the time we return from this + // function, all IO for this value is done. It's not required -- we could totally + // stop polling the IO futures in the sidecar task, they need to support that, + // but just stopping to poll doesn't reduce the IO load on the disk. It's easier + // to reason about the system if we just wait for all IO to complete, even if + // we're no longer interested in the result. + // + // Revisit this when IO futures are replaced with a more sophisticated IO system + // and an IO scheduler, where we know which IOs were submitted and which ones + // just queued. Cf the comment on IoConcurrency::spawn_io. + for (_lsn, waiter) in self.on_disk_values { + let value_recv_res = waiter + .wait_completion() + // we rely on the caller to poll us to completion, so this is not a bail point + .await; + + match (&mut res, value_recv_res) { + (Err(_), _) => { + // We've already failed, no need to process more. + } + (Ok(_), Err(_wait_err)) => { + // This shouldn't happen - likely the sidecar task panicked. + unreachable!(); + } + (Ok(_), Ok(Err(err))) => { + let err: std::io::Error = err; + res = Err(err); + } + (Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {} + (Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {} + } + } + + res + } } /// Bag of data accumulated during a vectored get.. -pub(crate) struct ValuesReconstructState { +pub struct ValuesReconstructState { /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` /// should not expect to get anything from this hashmap. - pub(crate) keys: HashMap, + pub keys: HashMap, /// The keys which are already retrieved keys_done: KeySpaceRandomAccum, @@ -272,7 +319,7 @@ pub(crate) struct ValuesReconstructState { /// The desired end state is that we always do parallel IO. /// This struct and the dispatching in the impl will be removed once /// we've built enough confidence. -pub(crate) enum IoConcurrency { +pub enum IoConcurrency { Sequential, SidecarTask { task_id: usize, @@ -317,10 +364,7 @@ impl IoConcurrency { Self::spawn(SelectedIoConcurrency::Sequential) } - pub(crate) fn spawn_from_conf( - conf: GetVectoredConcurrentIo, - gate_guard: GateGuard, - ) -> IoConcurrency { + pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency { let selected = match conf { GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential, GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard), @@ -425,16 +469,6 @@ impl IoConcurrency { } } - pub(crate) fn clone(&self) -> Self { - match self { - IoConcurrency::Sequential => IoConcurrency::Sequential, - IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { - task_id: *task_id, - ios_tx: ios_tx.clone(), - }, - } - } - /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string. /// /// The IO is represented as an opaque future. @@ -573,6 +607,18 @@ impl IoConcurrency { } } +impl Clone for IoConcurrency { + fn clone(&self) -> Self { + match self { + IoConcurrency::Sequential => IoConcurrency::Sequential, + IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask { + task_id: *task_id, + ios_tx: ios_tx.clone(), + }, + } + } +} + /// Make noise in case the [`ValuesReconstructState`] gets dropped while /// there are still IOs in flight. /// Refer to `collect_pending_ios` for why we prefer not to do that. @@ -603,7 +649,7 @@ impl Drop for ValuesReconstructState { } impl ValuesReconstructState { - pub(crate) fn new(io_concurrency: IoConcurrency) -> Self { + pub fn new(io_concurrency: IoConcurrency) -> Self { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 200beba115..8e5b0ba648 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -70,23 +70,15 @@ pub struct InMemoryLayer { /// We use a separate lock for the index to reduce the critical section /// during which reads cannot be planned. /// - /// If you need access to both the index and the underlying file at the same time, - /// respect the following locking order to avoid deadlocks: - /// 1. [`InMemoryLayer::inner`] - /// 2. [`InMemoryLayer::index`] - /// - /// Note that the file backing [`InMemoryLayer::inner`] is append-only, - /// so it is not necessary to hold simultaneous locks on index. - /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency. + /// Note that the file backing [`InMemoryLayer::file`] is append-only, + /// so it is not necessary to hold a lock on the index while reading or writing from the file. /// In particular: - /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`]. - /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`]. + /// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`]. + /// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`]. index: RwLock>>, - /// The above fields never change, except for `end_lsn`, which is only set once, - /// and `index` (see rationale there). - /// All other changing parts are in `inner`, and protected by a mutex. - inner: RwLock, + /// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes. + file: EphemeralFile, estimated_in_mem_size: AtomicU64, } @@ -96,20 +88,10 @@ impl std::fmt::Debug for InMemoryLayer { f.debug_struct("InMemoryLayer") .field("start_lsn", &self.start_lsn) .field("end_lsn", &self.end_lsn) - .field("inner", &self.inner) .finish() } } -pub struct InMemoryLayerInner { - /// The values are stored in a serialized format in this file. - /// Each serialized Value is preceded by a 'u32' length field. - /// PerSeg::page_versions map stores offsets into this file. - file: EphemeralFile, - - resource_units: GlobalResourceUnits, -} - /// Support the same max blob length as blob_io, because ultimately /// all the InMemoryLayer contents end up being written into a delta layer, /// using the [`crate::tenant::blob_io`]. @@ -258,12 +240,6 @@ struct IndexEntryUnpacked { pos: u64, } -impl std::fmt::Debug for InMemoryLayerInner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InMemoryLayerInner").finish() - } -} - /// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline, /// to minimize contention. /// @@ -280,7 +256,7 @@ pub(crate) struct GlobalResources { } // Per-timeline RAII struct for its contribution to [`GlobalResources`] -struct GlobalResourceUnits { +pub(crate) struct GlobalResourceUnits { // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible // for decrementing the global counter by this many bytes when dropped. dirty_bytes: u64, @@ -292,7 +268,7 @@ impl GlobalResourceUnits { // updated when the Timeline "ticks" in the background. const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024; - fn new() -> Self { + pub(crate) fn new() -> Self { GLOBAL_RESOURCES .dirty_layers .fetch_add(1, AtomicOrdering::Relaxed); @@ -304,7 +280,7 @@ impl GlobalResourceUnits { /// /// Returns the effective layer size limit that should be applied, if any, to keep /// the total number of dirty bytes below the configured maximum. - fn publish_size(&mut self, size: u64) -> Option { + pub(crate) fn publish_size(&mut self, size: u64) -> Option { let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) { Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed), Ordering::Greater => { @@ -349,7 +325,7 @@ impl GlobalResourceUnits { // Call publish_size if the input size differs from last published size by more than // the drift limit - fn maybe_publish_size(&mut self, size: u64) { + pub(crate) fn maybe_publish_size(&mut self, size: u64) { let publish = match size.cmp(&self.dirty_bytes) { Ordering::Equal => false, Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT, @@ -398,8 +374,8 @@ impl InMemoryLayer { } } - pub(crate) fn try_len(&self) -> Option { - self.inner.try_read().map(|i| i.file.len()).ok() + pub(crate) fn len(&self) -> u64 { + self.file.len() } pub(crate) fn assert_writable(&self) { @@ -430,7 +406,7 @@ impl InMemoryLayer { // Look up the keys in the provided keyspace and update // the reconstruct state with whatever is found. - pub(crate) async fn get_values_reconstruct_data( + pub async fn get_values_reconstruct_data( self: &Arc, keyspace: KeySpace, lsn_range: Range, @@ -479,14 +455,13 @@ impl InMemoryLayer { } } } - drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below + drop(index); // release the lock before we spawn the IO let read_from = Arc::clone(self); let read_ctx = ctx.attached_child(); reconstruct_state .spawn_io(async move { - let inner = read_from.inner.read().await; let f = vectored_dio_read::execute( - &inner.file, + &read_from.file, reads .iter() .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), @@ -518,7 +493,6 @@ impl InMemoryLayer { // This is kinda forced for InMemoryLayer because we need to inner.read() anyway, // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit // drop for consistency among all three layer types. - drop(inner); drop(read_from); }) .await; @@ -549,12 +523,6 @@ impl std::fmt::Display for InMemoryLayer { } impl InMemoryLayer { - /// Get layer size. - pub async fn size(&self) -> Result { - let inner = self.inner.read().await; - Ok(inner.file.len()) - } - pub fn estimated_in_mem_size(&self) -> u64 { self.estimated_in_mem_size.load(AtomicOrdering::Relaxed) } @@ -587,10 +555,7 @@ impl InMemoryLayer { end_lsn: OnceLock::new(), opened_at: Instant::now(), index: RwLock::new(BTreeMap::new()), - inner: RwLock::new(InMemoryLayerInner { - file, - resource_units: GlobalResourceUnits::new(), - }), + file, estimated_in_mem_size: AtomicU64::new(0), }) } @@ -599,41 +564,37 @@ impl InMemoryLayer { /// /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from. /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable. + /// + /// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`]. + /// /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. pub async fn put_batch( &self, serialized_batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { - let (base_offset, metadata) = { - let mut inner = self.inner.write().await; - self.assert_writable(); + self.assert_writable(); - let base_offset = inner.file.len(); + let base_offset = self.file.len(); - let SerializedValueBatch { - raw, - metadata, - max_lsn: _, - len: _, - } = serialized_batch; + let SerializedValueBatch { + raw, + metadata, + max_lsn: _, + len: _, + } = serialized_batch; - // Write the batch to the file - inner.file.write_raw(&raw, ctx).await?; - let new_size = inner.file.len(); + // Write the batch to the file + self.file.write_raw(&raw, ctx).await?; + let new_size = self.file.len(); - let expected_new_len = base_offset - .checked_add(raw.len().into_u64()) - // write_raw would error if we were to overflow u64. - // also IndexEntry and higher levels in - //the code don't allow the file to grow that large - .unwrap(); - assert_eq!(new_size, expected_new_len); - - inner.resource_units.maybe_publish_size(new_size); - - (base_offset, metadata) - }; + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. + // also IndexEntry and higher levels in + //the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); // Update the index with the new entries let mut index = self.index.write().await; @@ -686,10 +647,8 @@ impl InMemoryLayer { self.opened_at } - pub(crate) async fn tick(&self) -> Option { - let mut inner = self.inner.write().await; - let size = inner.file.len(); - inner.resource_units.publish_size(size) + pub(crate) fn tick(&self) -> Option { + self.file.tick() } pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { @@ -753,12 +712,6 @@ impl InMemoryLayer { gate: &utils::sync::gate::Gate, cancel: CancellationToken, ) -> Result> { - // Grab the lock in read-mode. We hold it over the I/O, but because this - // layer is not writeable anymore, no one should be trying to acquire the - // write lock on it, so we shouldn't block anyone. See the comment on - // [`InMemoryLayer::freeze`] to understand how locking between the append path - // and layer flushing works. - let inner = self.inner.read().await; let index = self.index.read().await; use l0_flush::Inner; @@ -793,7 +746,7 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { - let file_contents = inner.file.load_to_io_buf(ctx).await?; + let file_contents = self.file.load_to_io_buf(ctx).await?; let file_contents = file_contents.freeze(); for (key, vec_map) in index.iter() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0f1e857241..4f47ca6faa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -816,7 +816,7 @@ impl From for FlushLayerError { } #[derive(thiserror::Error, Debug)] -pub(crate) enum GetVectoredError { +pub enum GetVectoredError { #[error("timeline shutting down")] Cancelled, @@ -849,7 +849,7 @@ impl From for GetVectoredError { } #[derive(thiserror::Error, Debug)] -pub(crate) enum GetReadyAncestorError { +pub enum GetReadyAncestorError { #[error("ancestor LSN wait error")] AncestorLsnTimeout(#[from] WaitLsnError), @@ -939,7 +939,7 @@ impl std::fmt::Debug for Timeline { } #[derive(thiserror::Error, Debug, Clone)] -pub(crate) enum WaitLsnError { +pub enum WaitLsnError { // Called on a timeline which is shutting down #[error("Shutdown")] Shutdown, @@ -1902,16 +1902,11 @@ impl Timeline { return; }; - let Some(current_size) = open_layer.try_len() else { - // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so - // read lock to get size should always succeed. - tracing::warn!("Lock conflict while reading size of open layer"); - return; - }; + let current_size = open_layer.len(); let current_lsn = self.get_last_record_lsn(); - let checkpoint_distance_override = open_layer.tick().await; + let checkpoint_distance_override = open_layer.tick(); if let Some(size_override) = checkpoint_distance_override { if current_size > size_override { @@ -6547,7 +6542,7 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); - let mut layers_to_remove = Vec::new(); + let max_retain_lsn = retain_lsns.iter().max(); // Scan all layers in the timeline (remote or on-disk). // @@ -6557,108 +6552,110 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. it does not need to be kept for LSNs holding valid leases. // 5. newer on-disk image layers cover the layer's whole key range - // - // TODO holding a write lock is too agressive and avoidable - let mut guard = self - .layers - .write(LayerManagerLockHolder::GarbageCollection) - .await; - let layers = guard.layer_map()?; - 'outer: for l in layers.iter_historic_layers() { - result.layers_total += 1; + let layers_to_remove = { + let mut layers_to_remove = Vec::new(); - // 1. Is it newer than GC horizon cutoff point? - if l.get_lsn_range().end > space_cutoff { - info!( - "keeping {} because it's newer than space_cutoff {}", - l.layer_name(), - space_cutoff, - ); - result.layers_needed_by_cutoff += 1; - continue 'outer; - } + let guard = self + .layers + .read(LayerManagerLockHolder::GarbageCollection) + .await; + let layers = guard.layer_map()?; + 'outer: for l in layers.iter_historic_layers() { + result.layers_total += 1; - // 2. It is newer than PiTR cutoff point? - if l.get_lsn_range().end > time_cutoff { - info!( - "keeping {} because it's newer than time_cutoff {}", - l.layer_name(), - time_cutoff, - ); - result.layers_needed_by_pitr += 1; - continue 'outer; - } - - // 3. Is it needed by a child branch? - // NOTE With that we would keep data that - // might be referenced by child branches forever. - // We can track this in child timeline GC and delete parent layers when - // they are no longer needed. This might be complicated with long inheritance chains. - // - // TODO Vec is not a great choice for `retain_lsns` - for retain_lsn in &retain_lsns { - // start_lsn is inclusive - if &l.get_lsn_range().start <= retain_lsn { - info!( - "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > space_cutoff { + debug!( + "keeping {} because it's newer than space_cutoff {}", l.layer_name(), - retain_lsn, - l.is_incremental(), + space_cutoff, ); - result.layers_needed_by_branches += 1; + result.layers_needed_by_cutoff += 1; continue 'outer; } - } - // 4. Is there a valid lease that requires us to keep this layer? - if let Some(lsn) = &max_lsn_with_valid_lease { - // keep if layer start <= any of the lease - if &l.get_lsn_range().start <= lsn { - info!( - "keeping {} because there is a valid lease preventing GC at {}", + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > time_cutoff { + debug!( + "keeping {} because it's newer than time_cutoff {}", l.layer_name(), - lsn, + time_cutoff, ); - result.layers_needed_by_leases += 1; + result.layers_needed_by_pitr += 1; continue 'outer; } + + // 3. Is it needed by a child branch? + // NOTE With that we would keep data that + // might be referenced by child branches forever. + // We can track this in child timeline GC and delete parent layers when + // they are no longer needed. This might be complicated with long inheritance chains. + if let Some(retain_lsn) = max_retain_lsn { + // start_lsn is inclusive + if &l.get_lsn_range().start <= retain_lsn { + debug!( + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.layer_name(), + retain_lsn, + l.is_incremental(), + ); + result.layers_needed_by_branches += 1; + continue 'outer; + } + } + + // 4. Is there a valid lease that requires us to keep this layer? + if let Some(lsn) = &max_lsn_with_valid_lease { + // keep if layer start <= any of the lease + if &l.get_lsn_range().start <= lsn { + debug!( + "keeping {} because there is a valid lease preventing GC at {}", + l.layer_name(), + lsn, + ); + result.layers_needed_by_leases += 1; + continue 'outer; + } + } + + // 5. Is there a later on-disk layer for this relation? + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) + { + debug!("keeping {} because it is the latest layer", l.layer_name()); + result.layers_not_updated += 1; + continue 'outer; + } + + // We didn't find any reason to keep this file, so remove it. + info!( + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.layer_name(), + l.is_incremental(), + ); + layers_to_remove.push(l); } - // 5. Is there a later on-disk layer for this relation? - // - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - // - // For example, imagine that the following layers exist: - // - // 1000 - image (A) - // 1000-2000 - delta (B) - // 2000 - image (C) - // 2000-3000 - delta (D) - // 3000 - image (E) - // - // If GC horizon is at 2500, we can remove layers A and B, but - // we cannot remove C, even though it's older than 2500, because - // the delta layer 2000-3000 depends on it. - if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) - { - info!("keeping {} because it is the latest layer", l.layer_name()); - result.layers_not_updated += 1; - continue 'outer; - } - - // We didn't find any reason to keep this file, so remove it. - info!( - "garbage collecting {} is_dropped: xx is_incremental: {}", - l.layer_name(), - l.is_incremental(), - ); - layers_to_remove.push(l); - } + layers_to_remove + }; if !layers_to_remove.is_empty() { // Persist the new GC cutoff value before we actually remove anything. @@ -6674,15 +6671,19 @@ impl Timeline { } })?; + let mut guard = self + .layers + .write(LayerManagerLockHolder::GarbageCollection) + .await; + let gc_layers = layers_to_remove .iter() - .map(|x| guard.get_from_desc(x)) + .flat_map(|desc| guard.try_get_from_key(&desc.key()).cloned()) .collect::>(); result.layers_removed = gc_layers.len() as u64; self.remote_client.schedule_gc_update(&gc_layers)?; - guard.open_mut()?.finish_gc_timeline(&gc_layers); #[cfg(feature = "testing")] @@ -7370,7 +7371,7 @@ impl TimelineWriter<'_> { .tl .get_layer_for_write(at, &self.write_guard, ctx) .await?; - let initial_size = layer.size().await?; + let initial_size = layer.len(); let last_freeze_at = self.last_freeze_at.load(); self.write_guard.replace(TimelineWriterState::new( diff --git a/pgxn/Makefile b/pgxn/Makefile new file mode 100644 index 0000000000..8f190668ea --- /dev/null +++ b/pgxn/Makefile @@ -0,0 +1,28 @@ +# This makefile assumes that 'pg_config' is in the path, or is passed in the +# PG_CONFIG variable. +# +# This is used in two different ways: +# +# 1. The main makefile calls this, when you invoke the `make neon-pg-ext-%` +# target. It passes PG_CONFIG pointing to pg_install/%/bin/pg_config. +# This is a VPATH build; the current directory is build/pgxn-%, and +# the path to the Makefile is passed with the -f argument. +# +# 2. compute-node.Dockerfile invokes this to build the compute extensions +# for the specific Postgres version. It relies on pg_config already +# being in $(PATH). + +srcdir = $(dir $(firstword $(MAKEFILE_LIST))) + +PG_CONFIG = pg_config + +subdirs = neon neon_rmgr neon_walredo neon_utils neon_test_utils + +.PHONY: install install-compute install-storage $(subdirs) +install: $(subdirs) +install-compute: neon neon_utils neon_test_utils neon_rmgr +install-storage: neon_rmgr neon_walredo + +$(subdirs): %: + mkdir -p $* + $(MAKE) PG_CONFIG=$(PG_CONFIG) -C $* -f $(abspath $(srcdir)/$@/Makefile) install diff --git a/poetry.lock b/poetry.lock index f9b6f83366..1bc5077eb7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -746,23 +746,23 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.34.11" +version = "1.34.162" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, - {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, + {file = "botocore-1.34.162-py3-none-any.whl", hash = "sha256:2d918b02db88d27a75b48275e6fb2506e9adaaddbec1ffa6a8a0898b34e769be"}, + {file = "botocore-1.34.162.tar.gz", hash = "sha256:adc23be4fb99ad31961236342b7cbf3c0bfc62532cd02852196032e8c0d682f3"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""} +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} [package.extras] -crt = ["awscrt (==0.19.19)"] +crt = ["awscrt (==0.21.2)"] [[package]] name = "botocore-stubs" @@ -3422,20 +3422,21 @@ files = [ [[package]] name = "urllib3" -version = "1.26.19" +version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +python-versions = ">=3.9" groups = ["main"] files = [ - {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, - {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, + {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, + {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.extras] -brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] [[package]] name = "websockets" diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs new file mode 100644 index 0000000000..61bdf2b747 --- /dev/null +++ b/proxy/src/batch.rs @@ -0,0 +1,146 @@ +//! Batch processing system based on intrusive linked lists. +//! +//! Enqueuing a batch job requires no allocations, with +//! direct support for cancelling jobs early. +use std::collections::BTreeMap; +use std::pin::pin; +use std::sync::Mutex; + +use futures::future::Either; +use scopeguard::ScopeGuard; +use tokio::sync::oneshot::error::TryRecvError; + +use crate::ext::LockExt; + +pub trait QueueProcessing: Send + 'static { + type Req: Send + 'static; + type Res: Send; + + /// Get the desired batch size. + fn batch_size(&self, queue_size: usize) -> usize; + + /// This applies a full batch of events. + /// Must respond with a full batch of replies. + /// + /// If this apply can error, it's expected that errors be forwarded to each Self::Res. + /// + /// Batching does not need to happen atomically. + fn apply(&mut self, req: Vec) -> impl Future> + Send; +} + +pub struct BatchQueue { + processor: tokio::sync::Mutex

, + inner: Mutex>, +} + +struct BatchJob { + req: P::Req, + res: tokio::sync::oneshot::Sender, +} + +impl BatchQueue

{ + pub fn new(p: P) -> Self { + Self { + processor: tokio::sync::Mutex::new(p), + inner: Mutex::new(BatchQueueInner { + version: 0, + queue: BTreeMap::new(), + }), + } + } + + pub async fn call(&self, req: P::Req) -> P::Res { + let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req); + let guard = scopeguard::guard(id, move |id| { + let mut inner = self.inner.lock_propagate_poison(); + if inner.queue.remove(&id).is_some() { + tracing::debug!("batched task cancelled before completion"); + } + }); + + let resp = loop { + // try become the leader, or try wait for success. + let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await + { + // we got the resp. + Either::Left((resp, _)) => break resp.ok(), + // we are the leader. + Either::Right((p, rx_)) => { + rx = rx_; + p + } + }; + + let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor); + + // apply a batch. + let values = processor.apply(reqs).await; + + // send response values. + for (tx, value) in std::iter::zip(resps, values) { + // sender hung up but that's fine. + drop(tx.send(value)); + } + + match rx.try_recv() { + Ok(resp) => break Some(resp), + Err(TryRecvError::Closed) => break None, + // edge case - there was a race condition where + // we became the leader but were not in the batch. + // + // Example: + // thread 1: register job id=1 + // thread 2: register job id=2 + // thread 2: processor.lock().await + // thread 1: processor.lock().await + // thread 2: becomes leader, batch_size=1, jobs=[1]. + Err(TryRecvError::Empty) => {} + } + }; + + // already removed. + ScopeGuard::into_inner(guard); + + resp.expect("no response found. batch processer should not panic") + } +} + +struct BatchQueueInner { + version: u64, + queue: BTreeMap>, +} + +impl BatchQueueInner

{ + fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver) { + let (tx, rx) = tokio::sync::oneshot::channel(); + + let id = self.version; + + // Overflow concern: + // This is a u64, and we might enqueue 2^16 tasks per second. + // This gives us 2^48 seconds (9 million years). + // Even if this does overflow, it will not break, but some + // jobs with the higher version might never get prioritised. + self.version += 1; + + self.queue.insert(id, BatchJob { req, res: tx }); + + (id, rx) + } + + fn get_batch(&mut self, p: &P) -> (Vec, Vec>) { + let batch_size = p.batch_size(self.queue.len()); + let mut reqs = Vec::with_capacity(batch_size); + let mut resps = Vec::with_capacity(batch_size); + + while reqs.len() < batch_size { + let Some((_, job)) = self.queue.pop_first() else { + break; + }; + reqs.push(job.req); + resps.push(job.res); + } + + (reqs, resps) + } +} diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index ba10fce7b4..e3be454713 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -201,7 +201,7 @@ pub async fn run() -> anyhow::Result<()> { auth_backend, http_listener, shutdown.clone(), - Arc::new(CancellationHandler::new(&config.connect_to_compute, None)), + Arc::new(CancellationHandler::new(&config.connect_to_compute)), endpoint_rate_limiter, ); diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 6ab6df5610..9215dbf73f 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -23,7 +23,8 @@ use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned}; -use crate::cancellation::{CancellationHandler, handle_cancel_messages}; +use crate::batch::BatchQueue; +use crate::cancellation::{CancellationHandler, CancellationProcessor}; use crate::config::{ self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, @@ -392,13 +393,7 @@ pub async fn run() -> anyhow::Result<()> { .as_ref() .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - // channel size should be higher than redis client limit to avoid blocking - let cancel_ch_size = args.cancellation_ch_size; - let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size); - let cancellation_handler = Arc::new(CancellationHandler::new( - &config.connect_to_compute, - Some(tx_cancel), - )); + let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute)); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit) @@ -530,21 +525,11 @@ pub async fn run() -> anyhow::Result<()> { match redis_kv_client.try_connect().await { Ok(()) => { info!("Connected to Redis KV client"); - maintenance_tasks.spawn(async move { - handle_cancel_messages( - &mut redis_kv_client, - rx_cancel, - args.cancellation_batch_size, - ) - .await?; + cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor { + client: redis_kv_client, + batch_size: args.cancellation_batch_size, + })); - drop(redis_kv_client); - - // `handle_cancel_messages` was terminated due to the tx_cancel - // being dropped. this is not worthy of an error, and this task can only return `Err`, - // so let's wait forever instead. - std::future::pending().await - }); break; } Err(e) => { diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index cce4c1d3a0..036f36c7f6 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,19 +1,23 @@ +use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use std::time::Duration; -use anyhow::{Context, anyhow}; +use anyhow::anyhow; +use futures::FutureExt; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::CancelToken; +use postgres_client::RawCancelToken; use postgres_client::tls::MakeTlsConnect; use redis::{Cmd, FromRedisValue, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; -use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, error, info, warn}; +use tokio::time::timeout; +use tracing::{debug, error, info}; use crate::auth::AuthError; use crate::auth::backend::ComputeUserInfo; +use crate::batch::{BatchQueue, QueueProcessing}; use crate::config::ComputeConfig; use crate::context::RequestContext; use crate::control_plane::ControlPlaneApi; @@ -27,46 +31,36 @@ use crate::redis::kv_ops::RedisKVClient; type IpSubnetKey = IpNet; -const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time +const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600); +const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570); // Message types for sending through mpsc channel pub enum CancelKeyOp { StoreCancelKey { - key: String, - field: String, - value: String, - resp_tx: Option>>, - _guard: CancelChannelSizeGuard<'static>, - expire: i64, // TTL for key + key: CancelKeyData, + value: Box, + expire: std::time::Duration, }, GetCancelData { - key: String, - resp_tx: oneshot::Sender>>, - _guard: CancelChannelSizeGuard<'static>, - }, - RemoveCancelKey { - key: String, - field: String, - resp_tx: Option>>, - _guard: CancelChannelSizeGuard<'static>, + key: CancelKeyData, }, } pub struct Pipeline { inner: redis::Pipeline, - replies: Vec, + replies: usize, } impl Pipeline { fn with_capacity(n: usize) -> Self { Self { inner: redis::Pipeline::with_capacity(n), - replies: Vec::with_capacity(n), + replies: 0, } } - async fn execute(&mut self, client: &mut RedisKVClient) { - let responses = self.replies.len(); + async fn execute(self, client: &mut RedisKVClient) -> Vec> { + let responses = self.replies; let batch_size = self.inner.len(); match client.query(&self.inner).await { @@ -76,176 +70,73 @@ impl Pipeline { batch_size, responses, "successfully completed cancellation jobs", ); - for (value, reply) in std::iter::zip(values, self.replies.drain(..)) { - reply.send_value(value); - } + values.into_iter().map(Ok).collect() } Ok(value) => { error!(batch_size, ?value, "unexpected redis return value"); - for reply in self.replies.drain(..) { - reply.send_err(anyhow!("incorrect response type from redis")); - } + std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis"))) + .take(responses) + .collect() } Err(err) => { - for reply in self.replies.drain(..) { - reply.send_err(anyhow!("could not send cmd to redis: {err}")); - } + std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}"))) + .take(responses) + .collect() } } - - self.inner.clear(); - self.replies.clear(); } - fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) { + fn add_command_with_reply(&mut self, cmd: Cmd) { self.inner.add_command(cmd); - self.replies.push(reply); + self.replies += 1; } fn add_command_no_reply(&mut self, cmd: Cmd) { self.inner.add_command(cmd).ignore(); } - - fn add_command(&mut self, cmd: Cmd, reply: Option) { - match reply { - Some(reply) => self.add_command_with_reply(cmd, reply), - None => self.add_command_no_reply(cmd), - } - } } impl CancelKeyOp { - fn register(self, pipe: &mut Pipeline) { + fn register(&self, pipe: &mut Pipeline) { #[allow(clippy::used_underscore_binding)] match self { - CancelKeyOp::StoreCancelKey { - key, - field, - value, - resp_tx, - _guard, - expire, - } => { - let reply = - resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard }); - pipe.add_command(Cmd::hset(&key, field, value), reply); - pipe.add_command_no_reply(Cmd::expire(key, expire)); + CancelKeyOp::StoreCancelKey { key, value, expire } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value)); + pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64)); } - CancelKeyOp::GetCancelData { - key, - resp_tx, - _guard, - } => { - let reply = CancelReplyOp::GetCancelData { resp_tx, _guard }; - pipe.add_command_with_reply(Cmd::hgetall(key), reply); - } - CancelKeyOp::RemoveCancelKey { - key, - field, - resp_tx, - _guard, - } => { - let reply = - resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard }); - pipe.add_command(Cmd::hdel(key, field), reply); + CancelKeyOp::GetCancelData { key } => { + let key = KeyPrefix::Cancel(*key).build_redis_key(); + pipe.add_command_with_reply(Cmd::hget(key, "data")); } } } } -// Message types for sending through mpsc channel -pub enum CancelReplyOp { - StoreCancelKey { - resp_tx: oneshot::Sender>, - _guard: CancelChannelSizeGuard<'static>, - }, - GetCancelData { - resp_tx: oneshot::Sender>>, - _guard: CancelChannelSizeGuard<'static>, - }, - RemoveCancelKey { - resp_tx: oneshot::Sender>, - _guard: CancelChannelSizeGuard<'static>, - }, +pub struct CancellationProcessor { + pub client: RedisKVClient, + pub batch_size: usize, } -impl CancelReplyOp { - fn send_err(self, e: anyhow::Error) { - match self { - CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { - resp_tx - .send(Err(e)) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::GetCancelData { resp_tx, _guard } => { - resp_tx - .send(Err(e)) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { - resp_tx - .send(Err(e)) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - } +impl QueueProcessing for CancellationProcessor { + type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp); + type Res = anyhow::Result; + + fn batch_size(&self, _queue_size: usize) -> usize { + self.batch_size } - fn send_value(self, v: redis::Value) { - match self { - CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { - let send = - FromRedisValue::from_owned_redis_value(v).context("could not parse value"); - resp_tx - .send(send) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::GetCancelData { resp_tx, _guard } => { - let send = - FromRedisValue::from_owned_redis_value(v).context("could not parse value"); - resp_tx - .send(send) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { - let send = - FromRedisValue::from_owned_redis_value(v).context("could not parse value"); - resp_tx - .send(send) - .inspect_err(|_| tracing::debug!("could not send reply")) - .ok(); - } - } - } -} - -// Running as a separate task to accept messages through the rx channel -pub async fn handle_cancel_messages( - client: &mut RedisKVClient, - mut rx: mpsc::Receiver, - batch_size: usize, -) -> anyhow::Result<()> { - let mut batch = Vec::with_capacity(batch_size); - let mut pipeline = Pipeline::with_capacity(batch_size); - - loop { - if rx.recv_many(&mut batch, batch_size).await == 0 { - warn!("shutting down cancellation queue"); - break Ok(()); - } + async fn apply(&mut self, batch: Vec) -> Vec { + let mut pipeline = Pipeline::with_capacity(batch.len()); let batch_size = batch.len(); debug!(batch_size, "running cancellation jobs"); - for msg in batch.drain(..) { - msg.register(&mut pipeline); + for (_, op) in &batch { + op.register(&mut pipeline); } - pipeline.execute(client).await; + pipeline.execute(&mut self.client).await } } @@ -256,7 +147,7 @@ pub struct CancellationHandler { compute_config: &'static ComputeConfig, // rate limiter of cancellation requests limiter: Arc>>, - tx: Option>, // send messages to the redis KV client task + tx: OnceLock>, // send messages to the redis KV client task } #[derive(Debug, Error)] @@ -296,13 +187,10 @@ impl ReportableError for CancelError { } impl CancellationHandler { - pub fn new( - compute_config: &'static ComputeConfig, - tx: Option>, - ) -> Self { + pub fn new(compute_config: &'static ComputeConfig) -> Self { Self { compute_config, - tx, + tx: OnceLock::new(), limiter: Arc::new(std::sync::Mutex::new( LeakyBucketRateLimiter::::new_with_shards( LeakyBucketRateLimiter::::DEFAULT, @@ -312,7 +200,14 @@ impl CancellationHandler { } } - pub(crate) fn get_key(self: &Arc) -> Session { + pub fn init_tx(&self, queue: BatchQueue) { + self.tx + .set(queue) + .map_err(|_| {}) + .expect("cancellation queue should be registered once"); + } + + pub(crate) fn get_key(self: Arc) -> Session { // we intentionally generate a random "backend pid" and "secret key" here. // we use the corresponding u64 as an identifier for the // actual endpoint+pid+secret for postgres/pgbouncer. @@ -322,14 +217,10 @@ impl CancellationHandler { let key: CancelKeyData = rand::random(); - let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); - let redis_key = prefix_key.build_redis_key(); - debug!("registered new query cancellation key {key}"); Session { key, - redis_key, - cancellation_handler: Arc::clone(self), + cancellation_handler: self, } } @@ -337,62 +228,43 @@ impl CancellationHandler { &self, key: CancelKeyData, ) -> Result, CancelError> { - let prefix_key: KeyPrefix = KeyPrefix::Cancel(key); - let redis_key = prefix_key.build_redis_key(); + let guard = Metrics::get() + .proxy + .cancel_channel_size + .guard(RedisMsgKind::HGet); + let op = CancelKeyOp::GetCancelData { key }; - let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); - let op = CancelKeyOp::GetCancelData { - key: redis_key, - resp_tx, - _guard: Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::HGetAll), - }; - - let Some(tx) = &self.tx else { + let Some(tx) = self.tx.get() else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); }; - tx.try_send(op) + const TIMEOUT: Duration = Duration::from_secs(5); + let result = timeout(TIMEOUT, tx.call((guard, op))) + .await + .map_err(|_| { + tracing::warn!("timed out waiting to receive GetCancelData response"); + CancelError::RateLimit + })? .map_err(|e| { - tracing::warn!("failed to send GetCancelData for {key}: {e}"); - }) - .map_err(|()| CancelError::InternalError)?; + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; - let result = resp_rx.await.map_err(|e| { + let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); CancelError::InternalError })?; - let cancel_state_str: Option = match result { - Ok(mut state) => { - if state.len() == 1 { - Some(state.remove(0).1) - } else { - tracing::warn!("unexpected number of entries in cancel state: {state:?}"); - return Err(CancelError::InternalError); - } - } - Err(e) => { - tracing::warn!("failed to receive cancel state from redis: {e}"); - return Err(CancelError::InternalError); - } - }; + let cancel_closure: CancelClosure = + serde_json::from_str(&cancel_state_str).map_err(|e| { + tracing::warn!("failed to deserialize cancel state: {e}"); + CancelError::InternalError + })?; - let cancel_state: Option = match cancel_state_str { - Some(state) => { - let cancel_closure: CancelClosure = serde_json::from_str(&state).map_err(|e| { - tracing::warn!("failed to deserialize cancel state: {e}"); - CancelError::InternalError - })?; - Some(cancel_closure) - } - None => None, - }; - Ok(cancel_state) + Ok(Some(cancel_closure)) } + /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query. @@ -467,10 +339,10 @@ impl CancellationHandler { /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). -#[derive(Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct CancelClosure { socket_addr: SocketAddr, - cancel_token: CancelToken, + cancel_token: RawCancelToken, hostname: String, // for pg_sni router user_info: ComputeUserInfo, } @@ -478,7 +350,7 @@ pub struct CancelClosure { impl CancelClosure { pub(crate) fn new( socket_addr: SocketAddr, - cancel_token: CancelToken, + cancel_token: RawCancelToken, hostname: String, user_info: ComputeUserInfo, ) -> Self { @@ -491,7 +363,7 @@ impl CancelClosure { } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query( - self, + &self, compute_config: &ComputeConfig, ) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; @@ -512,7 +384,6 @@ impl CancelClosure { pub(crate) struct Session { /// The user-facing key identifying this session. key: CancelKeyData, - redis_key: String, cancellation_handler: Arc, } @@ -521,60 +392,66 @@ impl Session { &self.key } - // Send the store key op to the cancellation handler and set TTL for the key - pub(crate) fn write_cancel_key( + /// Ensure the cancel key is continously refreshed, + /// but stop when the channel is dropped. + pub(crate) async fn maintain_cancel_key( &self, - cancel_closure: CancelClosure, - ) -> Result<(), CancelError> { - let Some(tx) = &self.cancellation_handler.tx else { - tracing::warn!("cancellation handler is not available"); - return Err(CancelError::InternalError); - }; + session_id: uuid::Uuid, + cancel: tokio::sync::oneshot::Receiver, + cancel_closure: &CancelClosure, + compute_config: &ComputeConfig, + ) { + futures::future::select( + std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)), + cancel, + ) + .await; - let closure_json = serde_json::to_string(&cancel_closure).map_err(|e| { - tracing::warn!("failed to serialize cancel closure: {e}"); - CancelError::InternalError - })?; - - let op = CancelKeyOp::StoreCancelKey { - key: self.redis_key.clone(), - field: "data".to_string(), - value: closure_json, - resp_tx: None, - _guard: Metrics::get() - .proxy - .cancel_channel_size - .guard(RedisMsgKind::HSet), - expire: CANCEL_KEY_TTL, - }; - - let _ = tx.try_send(op).map_err(|e| { - let key = self.key; - tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); - }); - Ok(()) + if let Err(err) = cancel_closure + .try_cancel_query(compute_config) + .boxed() + .await + { + tracing::warn!( + ?session_id, + ?err, + "could not cancel the query in the database" + ); + } } - pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> { - let Some(tx) = &self.cancellation_handler.tx else { + // Ensure the cancel key is continously refreshed. + async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! { + let Some(tx) = self.cancellation_handler.tx.get() else { tracing::warn!("cancellation handler is not available"); - return Err(CancelError::InternalError); + // don't exit, as we only want to exit if cancelled externally. + std::future::pending().await }; - let op = CancelKeyOp::RemoveCancelKey { - key: self.redis_key.clone(), - field: "data".to_string(), - resp_tx: None, - _guard: Metrics::get() + let closure_json = serde_json::to_string(&cancel_closure) + .expect("serialising to json string should not fail") + .into_boxed_str(); + + loop { + let guard = Metrics::get() .proxy .cancel_channel_size - .guard(RedisMsgKind::HDel), - }; + .guard(RedisMsgKind::HSet); + let op = CancelKeyOp::StoreCancelKey { + key: self.key, + value: closure_json.clone(), + expire: CANCEL_KEY_TTL, + }; - let _ = tx.try_send(op).map_err(|e| { - let key = self.key; - tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); - }); - Ok(()) + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registering cancellation key" + ); + + if tx.call((guard, op)).await.is_ok() { + tokio::time::sleep(CANCEL_KEY_REFRESH).await; + } + } } } diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index aae1fea07d..5dd264b35e 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -9,7 +9,7 @@ use itertools::Itertools; use postgres_client::config::{AuthKeys, SslMode}; use postgres_client::maybe_tls_stream::MaybeTlsStream; use postgres_client::tls::MakeTlsConnect; -use postgres_client::{CancelToken, NoTls, RawConnection}; +use postgres_client::{NoTls, RawCancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use thiserror::Error; use tokio::net::{TcpStream, lookup_host}; @@ -265,7 +265,8 @@ impl ConnectInfo { } } -type RustlsStream = >::Stream; +pub type RustlsStream = >::Stream; +pub type MaybeRustlsStream = MaybeTlsStream; pub(crate) struct PostgresConnection { /// Socket connected to a compute node. @@ -279,7 +280,7 @@ pub(crate) struct PostgresConnection { /// Notices received from compute after authenticating pub(crate) delayed_notice: Vec, - _guage: NumDbConnectionsGuard<'static>, + pub(crate) guage: NumDbConnectionsGuard<'static>, } impl ConnectInfo { @@ -327,8 +328,7 @@ impl ConnectInfo { // Yet another reason to rework the connection establishing code. let cancel_closure = CancelClosure::new( socket_addr, - CancelToken { - socket_config: None, + RawCancelToken { ssl_mode: self.ssl_mode, process_id, secret_key, @@ -343,7 +343,7 @@ impl ConnectInfo { delayed_notice, cancel_closure, aux, - _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), + guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; Ok(connection) diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 5331ea41fd..89adfc9049 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -120,7 +120,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass(&config.connect_to_compute).await { + match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { error!( @@ -232,22 +232,35 @@ pub(crate) async fn handle_client( .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) }) .await?; - let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session = cancellation_handler_clone.get_key(); - - session.write_cancel_key(node.cancel_closure.clone())?; + let session = cancellation_handler.get_key(); prepare_client_connection(&node, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; + let session_id = ctx.session_id(); + let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + session + .maintain_cancel_key( + session_id, + cancel, + &node.cancel_closure, + &config.connect_to_compute, + ) + .await; + }); + Ok(Some(ProxyPassthrough { client: stream, - aux: node.aux.clone(), + compute: node.stream, + + aux: node.aux, private_link_id: None, - compute: node, - session_id: ctx.session_id(), - cancel: session, + + _cancel_on_shutdown: cancel_on_shutdown, + _req: request_gauge, _conn: conn_gauge, + _db_conn: node.guage, })) } diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 026c6aeba9..d96f582fad 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -75,6 +75,7 @@ pub mod binary; mod auth; +mod batch; mod cache; mod cancellation; mod compute; diff --git a/proxy/src/pglb/passthrough.rs b/proxy/src/pglb/passthrough.rs index 6f651d383d..d4c029f6d9 100644 --- a/proxy/src/pglb/passthrough.rs +++ b/proxy/src/pglb/passthrough.rs @@ -1,15 +1,17 @@ -use futures::FutureExt; +use std::convert::Infallible; + use smol_str::SmolStr; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::debug; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; -use crate::cancellation; -use crate::compute::PostgresConnection; -use crate::config::ComputeConfig; +use crate::compute::MaybeRustlsStream; use crate::control_plane::messages::MetricsAuxInfo; -use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::metrics::{ + Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard, + NumDbConnectionsGuard, +}; use crate::stream::Stream; use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; @@ -64,40 +66,20 @@ pub(crate) async fn proxy_pass( pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, - pub(crate) compute: PostgresConnection, + pub(crate) compute: MaybeRustlsStream, + pub(crate) aux: MetricsAuxInfo, - pub(crate) session_id: uuid::Uuid, pub(crate) private_link_id: Option, - pub(crate) cancel: cancellation::Session, + + pub(crate) _cancel_on_shutdown: tokio::sync::oneshot::Sender, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, + pub(crate) _db_conn: NumDbConnectionsGuard<'static>, } impl ProxyPassthrough { - pub(crate) async fn proxy_pass( - self, - compute_config: &ComputeConfig, - ) -> Result<(), ErrorSource> { - let res = proxy_pass( - self.client, - self.compute.stream, - self.aux, - self.private_link_id, - ) - .await; - if let Err(err) = self - .compute - .cancel_closure - .try_cancel_query(compute_config) - .boxed() - .await - { - tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); - } - - drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error - - res + pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { + proxy_pass(self.client, self.compute, self.aux, self.private_link_id).await } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 4211406f6c..7da1b8d8fa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -155,7 +155,7 @@ pub async fn task_main( Ok(Some(p)) => { ctx.set_success(); let _disconnect = ctx.log_connect(); - match p.proxy_pass(&config.connect_to_compute).await { + match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { warn!( @@ -372,13 +372,24 @@ pub(crate) async fn handle_client( Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, }; - let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let session = cancellation_handler_clone.get_key(); + let session = cancellation_handler.get_key(); - session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; + let session_id = ctx.session_id(); + let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + session + .maintain_cancel_key( + session_id, + cancel, + &node.cancel_closure, + &config.connect_to_compute, + ) + .await; + }); + let private_link_id = match ctx.extra() { Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), @@ -387,13 +398,16 @@ pub(crate) async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, - aux: node.aux.clone(), + compute: node.stream, + + aux: node.aux, private_link_id, - compute: node, - session_id: ctx.session_id(), - cancel: session, + + _cancel_on_shutdown: cancel_on_shutdown, + _req: request_gauge, _conn: conn_gauge, + _db_conn: node.guage, })) } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index 3113bad949..b453e6851c 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -1,8 +1,4 @@ -use std::io::ErrorKind; - -use anyhow::Ok; - -use crate::pqproto::{CancelKeyData, id_to_cancel_key}; +use crate::pqproto::CancelKeyData; pub mod keyspace { pub const CANCEL_PREFIX: &str = "cancel"; @@ -23,39 +19,12 @@ impl KeyPrefix { } } } - - #[allow(dead_code)] - pub(crate) fn as_str(&self) -> &'static str { - match self { - KeyPrefix::Cancel(_) => keyspace::CANCEL_PREFIX, - } - } -} - -#[allow(dead_code)] -pub(crate) fn parse_redis_key(key: &str) -> anyhow::Result { - let (prefix, key_str) = key.split_once(':').ok_or_else(|| { - anyhow::anyhow!(std::io::Error::new( - ErrorKind::InvalidData, - "missing prefix" - )) - })?; - - match prefix { - keyspace::CANCEL_PREFIX => { - let id = u64::from_str_radix(key_str, 16)?; - - Ok(KeyPrefix::Cancel(id_to_cancel_key(id))) - } - _ => Err(anyhow::anyhow!(std::io::Error::new( - ErrorKind::InvalidData, - "unknown prefix" - ))), - } } #[cfg(test)] mod tests { + use crate::pqproto::id_to_cancel_key; + use super::*; #[test] @@ -65,16 +34,4 @@ mod tests { let redis_key = cancel_key.build_redis_key(); assert_eq!(redis_key, "cancel:30390000d431"); } - - #[test] - fn test_parse_redis_key() { - let redis_key = "cancel:30390000d431"; - let key: KeyPrefix = parse_redis_key(redis_key).expect("Failed to parse key"); - - let ref_key = id_to_cancel_key(12345 << 32 | 54321); - - assert_eq!(key.as_str(), KeyPrefix::Cancel(ref_key).as_str()); - let KeyPrefix::Cancel(cancel_key) = key; - assert_eq!(ref_key, cancel_key); - } } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index f71730c533..f8d3b5cc66 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,3 +1,6 @@ +use std::time::Duration; + +use futures::FutureExt; use redis::aio::ConnectionLike; use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; @@ -35,14 +38,11 @@ impl RedisKVClient { } pub async fn try_connect(&mut self) -> anyhow::Result<()> { - match self.client.connect().await { - Ok(()) => {} - Err(e) => { - tracing::error!("failed to connect to redis: {e}"); - return Err(e); - } - } - Ok(()) + self.client + .connect() + .boxed() + .await + .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) } pub(crate) async fn query( @@ -54,15 +54,25 @@ impl RedisKVClient { return Err(anyhow::anyhow!("Rate limit exceeded")); } - match q.query(&mut self.client).await { + let e = match q.query(&mut self.client).await { Ok(t) => return Ok(t), - Err(e) => { - tracing::error!("failed to run query: {e}"); + Err(e) => e, + }; + + tracing::error!("failed to run query: {e}"); + match e.retry_method() { + redis::RetryMethod::Reconnect => { + tracing::info!("Redis client is disconnected. Reconnecting..."); + self.try_connect().await?; } + redis::RetryMethod::RetryImmediately => {} + redis::RetryMethod::WaitAndRetry => { + // somewhat arbitrary. + tokio::time::sleep(Duration::from_millis(100)).await; + } + _ => Err(e)?, } - tracing::info!("Redis client is disconnected. Reconnecting..."); - self.try_connect().await?; Ok(q.query(&mut self.client).await?) } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 8648a94869..0d374e6df2 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -167,7 +167,7 @@ pub(crate) async fn serve_websocket( Ok(Some(p)) => { ctx.set_success(); ctx.log_connect(); - match p.proxy_pass(&config.connect_to_compute).await { + match p.proxy_pass().await { Ok(()) => Ok(()), Err(ErrorSource::Client(err)) => Err(err).context("client"), Err(ErrorSource::Compute(err)) => Err(err).context("compute"), diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 2eea2f9d10..fc0ba9f28c 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -207,6 +207,12 @@ struct Cli { /// the compute notification directly (instead of via control plane). #[arg(long, default_value = "false")] use_local_compute_notifications: bool, + + /// Number of safekeepers to choose for a timeline when creating it. + /// Safekeepers will be choosen from different availability zones. + /// This option exists primarily for testing purposes. + #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] + timeline_safekeeper_count: i64, } enum StrictMode { @@ -371,6 +377,11 @@ async fn async_main() -> anyhow::Result<()> { StrictMode::Strict if args.use_local_compute_notifications => { anyhow::bail!("`--use-local-compute-notifications` is only permitted in `--dev` mode"); } + StrictMode::Strict if args.timeline_safekeeper_count < 3 => { + anyhow::bail!( + "Running with less than 3 safekeepers per timeline is only permitted in `--dev` mode" + ); + } StrictMode::Strict => { tracing::info!("Starting in strict mode: configuration is OK.") } @@ -433,6 +444,7 @@ async fn async_main() -> anyhow::Result<()> { ssl_ca_certs, timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, + timeline_safekeeper_count: args.timeline_safekeeper_count, }; // Validate that we can connect to the database diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 14c81ccf59..6ec3963c48 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -466,6 +466,10 @@ pub struct Config { pub timelines_onto_safekeepers: bool, pub use_local_compute_notifications: bool, + + /// Number of safekeepers to choose for a timeline when creating it. + /// Safekeepers will be choosen from different availability zones. + pub timeline_safekeeper_count: i64, } impl From for ApiError { diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 61b9ec6b6d..193a1833a7 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,3 +1,4 @@ +use std::cmp::max; use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; @@ -608,7 +609,8 @@ impl Service { Ok(()) } - /// Choose safekeepers for the new timeline: 3 in different azs. + /// Choose safekeepers for the new timeline in different azs. + /// 3 are choosen by default, but may be configured via config (for testing). pub(crate) async fn safekeepers_for_new_timeline( &self, ) -> Result, ApiError> { @@ -651,18 +653,14 @@ impl Service { ) }); // Number of safekeepers in different AZs we are looking for - let wanted_count = match all_safekeepers.len() { - 0 => { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find any active safekeeper for new timeline", - ))); - } - // Have laxer requirements on testig mode as we don't want to - // spin up three safekeepers for every single test - #[cfg(feature = "testing")] - 1 | 2 => all_safekeepers.len(), - _ => 3, - }; + let mut wanted_count = self.config.timeline_safekeeper_count as usize; + // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release + // branch and is specified in tests/neon_local config. + if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count { + // In testing mode, we can have less safekeepers than the config says + wanted_count = max(all_safekeepers.len(), 1); + } + let mut sks = Vec::new(); let mut azs = HashSet::new(); for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index e2d405227b..f5be544439 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -24,7 +24,7 @@ The value to place in the `aud` claim. @final class ComputeClaimsScope(StrEnum): - ADMIN = "admin" + ADMIN = "compute_ctl:admin" @final diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8cf1020adb..050d61055e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -489,7 +489,9 @@ class NeonEnvBuilder: self.config_init_force: str | None = None self.top_output_dir = top_output_dir self.control_plane_hooks_api: str | None = None - self.storage_controller_config: dict[Any, Any] | None = None + self.storage_controller_config: dict[Any, Any] | None = { + "timelines_onto_safekeepers": True, + } # Flag to enable https listener in pageserver, generate local ssl certs, # and force storage controller to use https for pageserver api. @@ -4909,6 +4911,9 @@ class Safekeeper(LogUtils): log.info(f"finished pulling timeline from {src_ids} to {self.id}") return res + def safekeeper_id(self) -> SafekeeperId: + return SafekeeperId(self.id, "localhost", self.port.pg_tenant_only) + @property def data_dir(self) -> Path: return self.env.repo_dir / "safekeepers" / f"sk{self.id}" diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index dc44fc77db..7788faceb4 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,7 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "timeline_offloading": False, "rel_size_v2_enabled": True, "relsize_snapshot_cache_capacity": 10000, - "gc_compaction_enabled": True, + "gc_compaction_enabled": False, "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9ce618b2ad..fa5c9aa693 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -11,6 +11,7 @@ from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active +from fixtures.safekeeper.http import MembershipConfiguration, TimelineCreateRequest from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException @@ -164,6 +165,19 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE ps_http.configure_failpoints(("before-upload-index-pausable", "pause")) env.pageserver.tenant_create(env.initial_tenant) + sk = env.safekeepers[0] + assert sk + sk.http_client().timeline_create( + TimelineCreateRequest( + env.initial_tenant, + env.initial_timeline, + MembershipConfiguration(generation=1, members=[sk.safekeeper_id()], new_members=None), + int(env.pg_version), + Lsn(0), + None, + ) + ) + initial_branch = "initial_branch" def start_creating_timeline(): diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 44590ea4b9..3335cf686c 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -64,6 +64,11 @@ def test_normal_work( """ neon_env_builder.num_safekeepers = num_safekeepers + + if safekeeper_proto_version == 2: + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 2590a3fe9d..2b71662669 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -671,6 +671,12 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu """ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # On the new mode, the test runs into a cancellation issue, i.e. the walproposer can't shut down + # as it is hang-waiting on the timeline_checkpoint call in WalIngest::new. + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + # turn off background tasks so that they don't interfere with the downloads env = neon_env_builder.init_start( initial_tenant_conf={ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 3695ece66b..728241b465 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -173,7 +173,11 @@ def test_pg_regress( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress" + # + # XXX: We assume that the `build` directory is a sibling of the + # pg_distrib_dir. That is the default when you check out the + # repository; `build` and `pg_install` are created side by side. + build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress" src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "parallel_schedule" @@ -250,7 +254,11 @@ def test_isolation( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. - build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation" + # + # XXX: We assume that the `build` directory is a sibling of the + # pg_distrib_dir. That is the default when you check out the + # repository; `build` and `pg_install` are created side by side. + build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/isolation" src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "isolation_schedule" @@ -314,8 +322,11 @@ def test_sql_regress( (runpath / "testtablespace").mkdir(parents=True) # Compute all the file locations that pg_regress will need. - # This test runs neon specific tests - build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" + # + # XXX: We assume that the `build` directory is a sibling of the + # pg_distrib_dir. That is the default when you check out the + # repository; `build` and `pg_install` are created side by side. + build_path = pg_distrib_dir / f"../build/{env.pg_version.v_prefixed}/src/test/regress" src_path = base_dir / "test_runner/sql_regress" bindir = pg_distrib_dir / f"v{env.pg_version}/bin" schedule = src_path / "parallel_schedule" diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 082808f9ff..2d7be1f9d1 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -74,7 +74,7 @@ def test_tenant_s3_restore( last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) last_flush_lsns.append(last_flush_lsn) ps_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn, timeout=60) log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}") parent = timeline diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8f3aa010e3..74ba74645e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -88,6 +88,12 @@ def test_storage_controller_smoke( neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() + # These bubble up from safekeepers + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) + # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.start() env.storage_controller.start() @@ -3455,7 +3461,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert target.get_safekeeper(fake_id) is None - assert len(target.get_safekeepers()) == 0 + start_sks = target.get_safekeepers() sk_0 = env.safekeepers[0] @@ -3477,7 +3483,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): inserted = target.get_safekeeper(fake_id) assert inserted is not None - assert target.get_safekeepers() == [inserted] + assert target.get_safekeepers() == start_sks + [inserted] assert eq_safekeeper_records(body, inserted) # error out if pk is changed (unexpected) @@ -3489,7 +3495,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert exc.value.status_code == 400 inserted_again = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_again] + assert target.get_safekeepers() == start_sks + [inserted_again] assert inserted_again is not None assert eq_safekeeper_records(inserted, inserted_again) @@ -3498,7 +3504,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["version"] += 1 target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_now] + assert target.get_safekeepers() == start_sks + [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) @@ -3507,7 +3513,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["https_port"] = 123 target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_now] + assert target.get_safekeepers() == start_sks + [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) env.storage_controller.consistency_check() @@ -3516,7 +3522,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): body["https_port"] = None target.on_safekeeper_deploy(fake_id, body) inserted_now = target.get_safekeeper(fake_id) - assert target.get_safekeepers() == [inserted_now] + assert target.get_safekeepers() == start_sks + [inserted_now] assert inserted_now is not None assert eq_safekeeper_records(body, inserted_now) env.storage_controller.consistency_check() @@ -3635,6 +3641,9 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi env = neon_env_builder.init_configs() env.start() + for ps in env.pageservers: + ps.allowed_errors.append(".*Timeline.* has been deleted.*") + tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1}) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 03cd133ccb..e29cb801d5 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -341,6 +341,11 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_configs() env.start() + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) + tenant_id = TenantId.generate() timeline_id = TimelineId.generate() env.create_tenant( diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index f0810270b1..c58f78aeb1 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -21,7 +21,10 @@ from fixtures.neon_fixtures import ( last_flush_lsn_upload, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.http import ( + HistoricLayerInfo, + PageserverApiException, +) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_timeline_detail_404 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.utils import assert_pageserver_backups_equal, skip_in_debug_build, wait_until @@ -413,6 +416,7 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots "read_only": True, }, ) + sk = env.safekeepers[0] assert sk with pytest.raises(requests.exceptions.HTTPError, match="Not Found"): @@ -504,8 +508,15 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots assert len(lineage.get("original_ancestor", [])) == 0 assert len(lineage.get("reparenting_history", [])) == 0 - for name, _, _, rows, starts in expected_result: - with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + for branch_name, queried_timeline, _, rows, starts in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + log.info(f"reading data from branch {branch_name}") + # specifying the lsn makes the endpoint read-only and not connect to safekeepers + with env.endpoints.create( + branch_name, + lsn=Lsn(details["last_record_lsn"]), + ) as ep: + ep.start(safekeeper_generation=1) assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 @@ -1088,6 +1099,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append(".*Timeline.* has been deleted.*") pageservers = dict((int(p.id), p) for p in env.pageservers) @@ -1209,6 +1221,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + ps.allowed_errors.append(".*Timeline.* has been deleted.*") pageservers = dict((int(p.id), p) for p in env.pageservers) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 9a710f5b80..8ef64a0742 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -24,6 +24,8 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool initial_tenant_conf={"gc_period": "1s", "lsn_lease_length": "0s"}, initial_tenant_shard_count=2 if sharded else None, ) + for ps in env.pageservers: + ps.allowed_errors.append(".*Timeline.* has been deleted.*") if sharded: http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index b9183286af..ea120c1814 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -229,7 +229,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): # Test timeline_list endpoint. http_cli = env.safekeepers[0].http_client() - assert len(http_cli.timeline_list()) == 3 + assert len(http_cli.timeline_list()) == 4 # Check that dead minority doesn't prevent the commits: execute insert n_inserts @@ -740,8 +740,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.create_branch("test_timeline_status") - endpoint = env.endpoints.create_start("test_timeline_status") + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start("main") wa = env.safekeepers[0] @@ -1292,6 +1292,12 @@ def test_lagging_sk(neon_env_builder: NeonEnvBuilder): # it works without compute at all. def test_peer_recovery(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + + # timelines should be created the old way + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -1532,6 +1538,11 @@ def test_safekeeper_without_pageserver( def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): + # timelines should be created the old way manually until we have migration support + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1661,6 +1672,15 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool): res = env.safekeepers[3].pull_timeline( [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id ) + sk_id_1 = env.safekeepers[0].safekeeper_id() + sk_id_3 = env.safekeepers[2].safekeeper_id() + sk_id_4 = env.safekeepers[3].safekeeper_id() + new_conf = MembershipConfiguration( + generation=2, members=[sk_id_1, sk_id_3, sk_id_4], new_members=None + ) + for i in [0, 2, 3]: + env.safekeepers[i].http_client().membership_switch(tenant_id, timeline_id, new_conf) + log.info("Finished pulling timeline") log.info(res) @@ -1705,13 +1725,15 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + dst_sk.stop() + + [tenant_id, timeline_id] = env.create_tenant() + log.info("use only first 2 safekeepers, 3rd will be seeded") - endpoint = env.endpoints.create("main") + endpoint = env.endpoints.create("main", tenant_id=tenant_id) endpoint.active_safekeepers = [1, 2] endpoint.start() endpoint.safe_psql("create table t(key int, value text)") @@ -1723,6 +1745,7 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): src_http = src_sk.http_client() # run pull_timeline which will halt before downloading files src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + dst_sk.start() pt_handle = PropagatingThread( target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) ) @@ -1782,23 +1805,27 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.initial_timeline (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + dst_sk.stop() + src_http = src_sk.http_client() + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + + timeline_id = env.create_branch("pull_timeline_term_changes") + + # run pull_timeline which will halt before downloading files log.info("use only first 2 safekeepers, 3rd will be seeded") - ep = env.endpoints.create("main") + ep = env.endpoints.create("pull_timeline_term_changes") ep.active_safekeepers = [1, 2] ep.start() ep.safe_psql("create table t(key int, value text)") ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - src_http = src_sk.http_client() - # run pull_timeline which will halt before downloading files - src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) pt_handle = PropagatingThread( target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) ) + dst_sk.start() pt_handle.start() src_sk.wait_until_paused("sk-snapshot-after-list-pausable") @@ -1807,7 +1834,7 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): # restart compute to bump term ep.stop() - ep = env.endpoints.create("main") + ep = env.endpoints.create("pull_timeline_term_changes") ep.active_safekeepers = [1, 2] ep.start() ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") @@ -1929,6 +1956,11 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): @run_only_on_default_postgres("tests only safekeeper API") def test_membership_api(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 + # timelines should be created the old way + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() # These are expected after timeline deletion on safekeepers. @@ -2009,6 +2041,12 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): created manually, later storcon will do that. """ neon_env_builder.num_safekeepers = 3 + + # timelines should be created the old way manually + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() tenant_id = env.initial_tenant @@ -2064,7 +2102,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.create_branch("test_idle_reconnections") + timeline_id = env.initial_timeline def collect_stats() -> dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -2095,7 +2133,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_idle_reconnections") + endpoint = env.endpoints.create_start("main") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index d8a7dc2a2b..1bad387a90 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -590,6 +590,13 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): @pytest.mark.parametrize("safekeeper_proto_version", [2, 3]) def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int): neon_env_builder.num_safekeepers = 3 + if safekeeper_proto_version == 2: + # On the legacy protocol, we don't support generations, which are part of + # `timelines_onto_safekeepers` + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + env = neon_env_builder.init_start() asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) @@ -713,6 +720,11 @@ async def run_quorum_sanity(env: NeonEnv): # we don't. def test_quorum_sanity(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 4 + + # The test fails basically always on the new mode. + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } env = neon_env_builder.init_start() asyncio.run(run_quorum_sanity(env)) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 0252b590cc..d281c055b0 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -16,6 +16,13 @@ if TYPE_CHECKING: # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. # Ensures that walreceiver does not run without any data inserted and only starts after the insertion. def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): + # we assert below that the walreceiver is not active before data writes. + # with manually created timelines, it is active. + # FIXME: remove this test once we remove timelines_onto_safekeepers + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": False, + } + # Trigger WAL wait timeout faster neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'" env = neon_env_builder.init_start() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 6770bc2513..9085654ee8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 6770bc251301ef40c66f7ecb731741dc435b5051 +Subproject commit 9085654ee8022d5cc4ca719380a1dc53e5e3246f diff --git a/vendor/revisions.json b/vendor/revisions.json index 12d5499ddb..b260698c86 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -13,6 +13,6 @@ ], "v14": [ "14.18", - "6770bc251301ef40c66f7ecb731741dc435b5051" + "9085654ee8022d5cc4ca719380a1dc53e5e3246f" ] }