diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 01f5c3ede9..cd95a5b16d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -728,30 +728,6 @@ jobs: tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - - name: Build compute-tools image - # compute-tools are Postgres independent, so build it only once - # We pick 16, because that builds on debian 11 with older glibc (and is - # thus compatible with newer glibc), rather than 17 on Debian 12, as - # that isn't guaranteed to be compatible with Debian 11 - if: matrix.version.pg == 'v16' - uses: docker/build-push-action@v6 - with: - target: compute-tools-image - context: . - build-args: | - GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} - DEBIAN_VERSION=${{ matrix.version.debian }} - provenance: false - push: true - pull: true - file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} - tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - compute-node-image: needs: [ compute-node-image-arch, tag ] permissions: @@ -794,14 +770,6 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: 
Create multi-arch compute-tools image - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: @@ -817,12 +785,6 @@ jobs: docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - - name: Push multi-arch compute-tools image to ECR - if: matrix.version.pg == 'v16' - run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} - vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] runs-on: [ self-hosted, large ] @@ -1001,9 +963,6 @@ jobs: docker buildx imagetools create -t $repo/neon:latest \ $repo/neon:${{ needs.tag.outputs.build-tag }} - docker buildx imagetools create -t $repo/compute-tools:latest \ - $repo/compute-tools:${{ needs.tag.outputs.build-tag }} - for version in ${VERSIONS}; do docker buildx imagetools create -t $repo/compute-node-${version}:latest \ $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} @@ -1032,7 +991,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do + for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx 
imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -1044,7 +1003,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -1056,7 +1015,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 + images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} diff --git a/Cargo.lock b/Cargo.lock index f727741883..1e29f4fc08 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1605,6 +1605,32 @@ dependencies = [ "typenum", ] +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", +] + +[[package]] +name = "curve25519-dalek-derive" +version = 
"0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "darling" version = "0.20.1" @@ -1875,6 +1901,28 @@ dependencies = [ "spki 0.7.3", ] +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871" +dependencies = [ + "curve25519-dalek", + "ed25519", + "rand_core 0.6.4", + "sha2", + "subtle", +] + [[package]] name = "either" version = "1.8.1" @@ -2113,6 +2161,12 @@ dependencies = [ "subtle", ] +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.22" @@ -3990,6 +4044,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "postgres_initdb", + "pprof", "pq_proto", "procfs", "rand 0.8.5", @@ -4745,6 +4800,7 @@ dependencies = [ "consumption_metrics", "dashmap 5.5.0", "ecdsa 0.16.9", + "ed25519-dalek", "env_logger 0.10.2", "fallible-iterator", "flate2", diff --git a/Dockerfile b/Dockerfile index d3659f917a..2e4f8e5546 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,6 +71,7 @@ RUN set -e \ ca-certificates \ # System postgres for use with client libraries (e.g. 
in storage controller) postgresql-15 \ + openssl \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ && chown -R neon:neon /data diff --git a/Makefile b/Makefile index 9cffc74508..22ebfea7d5 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ -OPENSSL_PREFIX_DIR := /usr/local/openssl ICU_PREFIX_DIR := /usr/local/icu # @@ -26,11 +25,9 @@ endif ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) # Exclude static build openssl, icu for local build (MacOS, Linux) # Only keep for build type release and debug - PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include PG_CONFIGURE_OPTS += --with-icu PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' - PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' endif UNAME_S := $(shell uname -s) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 79210a2e1b..7a2ec9c43e 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -115,7 +115,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. 
-ENV SQL_EXPORTER_VERSION=0.16.0 +ENV SQL_EXPORTER_VERSION=0.17.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ @@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=1.1.1w -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make -j "$(nproc)" && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. # diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 303daec240..299f4444a3 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -104,16 +104,18 @@ RUN cd postgres && \ esac; \ done; +# Set PATH for all the subsequent build steps +ENV PATH="/usr/local/pgsql/bin:$PATH" + ######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. 
# ######################################################################################### -FROM build-deps AS postgis-build +FROM pg-build AS postgis-build ARG DEBIAN_VERSION ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ @@ -151,8 +153,6 @@ RUN case "${DEBIAN_VERSION}" in \ DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ ninja clean && cp -R /sfcgal/* / -ENV PATH="/usr/local/pgsql/bin:$PATH" - # Postgis 3.5.0 supports v17 RUN case "${PG_VERSION}" in \ "v17") \ @@ -170,7 +170,6 @@ RUN case "${PG_VERSION}" in \ wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -220,11 +219,7 @@ RUN case "${PG_VERSION}" in \ cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. 
&& \ ninja -j $(getconf _NPROCESSORS_ONLN) && \ ninja -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control ######################################################################################### # @@ -232,9 +227,8 @@ RUN case "${PG_VERSION}" in \ # Build plv8 # ######################################################################################### -FROM build-deps AS plv8-build +FROM pg-build AS plv8-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch @@ -269,7 +263,6 @@ RUN case "${PG_VERSION}" in \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ @@ -296,9 +289,8 @@ RUN case "${PG_VERSION}" in \ # Build h3_pg # ######################################################################################### -FROM build-deps AS h3-pg-build +FROM pg-build AS h3-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v4.1.0 - Jan 18, 2023 @@ -319,7 +311,6 @@ RUN mkdir -p /h3/usr/ && \ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 
h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ @@ -331,17 +322,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # compile unit extension # ######################################################################################### -FROM build-deps AS unit-pg-build +FROM pg-build AS unit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release 7.9 - Sep 15, 2024 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \ echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. # This one-liner removes pgsql/ part of the path. 
@@ -355,9 +345,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz - # compile pgvector extension # ######################################################################################### -FROM build-deps AS vector-pg-build +FROM pg-build AS vector-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/pgvector.patch /pgvector.patch @@ -371,8 +360,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### @@ -381,16 +370,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O # compile pgjwt extension # ######################################################################################### -FROM build-deps AS pgjwt-pg-build +FROM pg-build AS pgjwt-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 
-C . && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control ######################################################################################### @@ -399,17 +387,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71 # compile hypopg extension # ######################################################################################### -FROM build-deps AS hypopg-pg-build +FROM pg-build AS hypopg-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # HypoPG 1.4.1 supports v17 # last release 1.4.1 - Apr 28, 2024 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control ######################################################################################### @@ -418,17 +405,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo # compile pg_hashids extension # ######################################################################################### -FROM build-deps AS pg-hashids-pg-build +FROM pg-build AS pg-hashids-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.1 -Jan 12, 2018 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control ######################################################################################### @@ -437,9 +423,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # compile rum extension # ######################################################################################### -FROM build-deps AS rum-pg-build +FROM pg-build AS rum-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch @@ -450,8 +435,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control ######################################################################################### @@ -460,17 +445,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea # compile pgTAP extension # ######################################################################################### -FROM build-deps AS pgtap-pg-build +FROM pg-build AS pgtap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pgtap 1.3.3 supports v17 # last release v1.3.3 - Apr 8, 2024 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control ######################################################################################### @@ -479,17 +463,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta # compile ip4r extension # ######################################################################################### -FROM build-deps AS ip4r-pg-build +FROM pg-build AS ip4r-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.4.2 - Jul 29, 2023 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control ######################################################################################### @@ -498,17 +481,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # compile Prefix extension # ######################################################################################### -FROM build-deps AS prefix-pg-build +FROM pg-build AS prefix-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.2.10 - Jul 5, 2023 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control ######################################################################################### @@ -517,17 +499,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # compile hll extension # ######################################################################################### -FROM build-deps AS hll-pg-build +FROM pg-build AS hll-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v2.18 - Aug 29, 2023 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control ######################################################################################### @@ -536,17 +517,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # compile plpgsql_check extension # ######################################################################################### -FROM build-deps AS plpgsql-check-pg-build +FROM pg-build AS plpgsql-check-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # plpgsql_check v2.7.11 supports v17 # last release v2.7.11 - Sep 16, 2024 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \ + make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control ######################################################################################### @@ -555,11 +535,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz # compile timescaledb extension # ######################################################################################### -FROM build-deps AS timescaledb-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS timescaledb-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -590,11 +567,8 @@ RUN case "${PG_VERSION}" in \ # compile pg_hint_plan extension # ######################################################################################### -FROM build-deps AS pg-hint-plan-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - +FROM pg-build AS pg-hint-plan-pg-build ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin:$PATH" # version-specific, has separate releases for each version RUN case "${PG_VERSION}" in \ @@ -632,14 +606,12 @@ RUN case "${PG_VERSION}" in \ # compile pg_cron extension # ######################################################################################### -FROM build-deps AS pg-cron-pg-build +FROM pg-build AS pg-cron-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. 
-ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ @@ -653,9 +625,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O # compile rdkit extension # ######################################################################################### -FROM build-deps AS rdkit-pg-build +FROM pg-build AS rdkit-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y \ @@ -673,7 +644,13 @@ RUN apt update && \ # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 -ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" + +# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find +# pg_config. For some reason the rdkit cmake script doesn't work with just that, +# however. By also adding /usr/local/pgsql, it works, which is weird because there +# are no executables in that directory. 
+ENV PATH="/usr/local/pgsql:$PATH" + RUN case "${PG_VERSION}" in \ "v17") \ export RDKIT_VERSION=Release_2024_09_1 \ @@ -726,13 +703,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_uuidv7 extension # ######################################################################################### -FROM build-deps AS pg-uuidv7-pg-build +FROM pg-build AS pg-uuidv7-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v1.6.0 - Oct 9, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ @@ -746,13 +721,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz # compile pg_roaringbitmap extension # ######################################################################################### -FROM build-deps AS pg-roaringbitmap-pg-build +FROM pg-build AS pg-roaringbitmap-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific # last release v0.5.4 - Jun 28, 2022 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . 
&& \ @@ -766,16 +739,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # compile pg_semver extension # ######################################################################################### -FROM build-deps AS pg-semver-pg-build +FROM pg-build AS pg-semver-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # Release 0.40.0 breaks backward compatibility with previous versions # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0 # Use new version only for v17 # # last release v0.40.0 - Jul 22, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ export SEMVER_VERSION=0.40.0 \ @@ -802,13 +773,11 @@ RUN case "${PG_VERSION}" in \ # compile pg_embedding extension # ######################################################################################### -FROM build-deps AS pg-embedding-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +FROM pg-build AS pg-embedding-pg-build # This is our extension, support stopped in favor of pgvector # TODO: deprecate it ARG PG_VERSION -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ @@ -829,26 +798,19 @@ RUN case "${PG_VERSION}" in \ # compile anon extension # ######################################################################################### -FROM build-deps AS pg-anon-pg-build +FROM pg-build AS pg-anon-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # This is an experimental extension, never got to real production. # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
-ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ esac && \ wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ - mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ - sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ - comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control ######################################################################################### # @@ -856,9 +818,8 @@ RUN case "${PG_VERSION}" in "v17") \ # This layer is used to build `pgrx` deps # ######################################################################################### -FROM build-deps AS rust-extensions-build +FROM pg-build AS rust-extensions-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -866,7 +827,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR 
/home/nonroot @@ -893,9 +854,8 @@ USER root # and eventually get merged with `rust-extensions-build` # ######################################################################################### -FROM build-deps AS rust-extensions-build-pgrx12 +FROM pg-build AS rust-extensions-build-pgrx12 ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ @@ -903,7 +863,7 @@ RUN apt update && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot -ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +ENV PATH="/home/nonroot/.cargo/bin:$PATH" USER nonroot WORKDIR /home/nonroot @@ -976,22 +936,9 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build ARG PG_VERSION -# version 0.3.3 supports v17 # last release v0.3.3 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_JSONSCHEMA_VERSION=0.3.3 \ - export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ - echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \ + echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions @@ -1012,22 +959,9 @@ RUN case "${PG_VERSION}" in \ FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build ARG PG_VERSION -# version 1.5.9 supports v17 # last release v1.5.9 - Oct 16, 2024 -# -# there were no breaking changes -# so we can use the same version for all postgres versions -RUN case "${PG_VERSION}" in \ - "v14" | "v15" | "v16" | "v17") \ - export PG_GRAPHQL_VERSION=1.5.9 \ - export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ - echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ + echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ @@ -1091,8 +1025,8 @@ ARG PG_VERSION # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs -RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ - echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ + echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release @@ -1104,13 +1038,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2 # ######################################################################################### -FROM build-deps AS wal2json-pg-build +FROM pg-build AS wal2json-pg-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # wal2json wal2json_2_6 supports v17 # last release wal2json_2_6 - Apr 25, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
&& \ @@ -1123,13 +1055,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar. # compile pg_ivm extension # ######################################################################################### -FROM build-deps AS pg-ivm-build +FROM pg-build AS pg-ivm-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # pg_ivm v1.9 supports v17 # last release v1.9 - Jul 31 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ @@ -1143,13 +1073,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv # compile pg_partman extension # ######################################################################################### -FROM build-deps AS pg-partman-build +FROM pg-build AS pg-partman-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # should support v17 https://github.com/pgpartman/pg_partman/discussions/693 # last release 5.1.0 Apr 2, 2024 -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . 
&& \ @@ -1165,9 +1093,6 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz ######################################################################################### FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \ echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \ @@ -1183,11 +1108,8 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p # ######################################################################################### -FROM build-deps AS pg-repack-build +FROM pg-build AS pg-repack-build ARG PG_VERSION -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ @@ -1258,20 +1180,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ - -s install && \ - case "${PG_VERSION}" in \ - "v14" | "v15") \ - ;; \ - "v16" | "v17") \ - echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ - ;; \ - *) \ - echo "unexpected PostgreSQL version" && exit 1 \ - ;; \ - esac && \ - make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/hnsw \ -s install ######################################################################################### @@ -1288,17 +1196,6 @@ USER nonroot COPY --chown=nonroot . . 
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy -######################################################################################### -# -# Final compute-tools image -# -######################################################################################### - -FROM debian:$DEBIAN_FLAVOR AS compute-tools-image - -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl -COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import - ######################################################################################### # # Layer "pgbouncer" @@ -1335,11 +1232,11 @@ RUN set -e \ # ######################################################################################### -FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter +FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. -FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter +FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter ######################################################################################### # diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 793ec4cf10..f554362751 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -17,7 +17,7 @@ //! //! # Local Testing //! -//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build. +//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build. //! - Build the image with the following command: //! //! 
```bash diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5b82acb3a5..2fe4cd5202 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -483,7 +483,6 @@ impl LocalEnv { .iter() .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) .map(|&(_, timeline_id)| timeline_id) - .map(TimelineId::from) } pub fn timeline_name_mappings(&self) -> HashMap { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 22d2420ed4..c41ff22d15 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -822,10 +822,7 @@ impl StorageController { self.dispatch( Method::PUT, format!("control/v1/tenant/{tenant_shard_id}/migrate"), - Some(TenantShardMigrateRequest { - tenant_shard_id, - node_id, - }), + Some(TenantShardMigrateRequest { node_id }), ) .await } diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 617b2cd1ba..9d133e4af1 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -1,12 +1,16 @@ use futures::StreamExt; -use std::{str::FromStr, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + time::Duration, +}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, - TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -112,6 +116,13 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Migrate the secondary location for a tenant shard to a specific pageserver. 
+ TenantShardMigrateSecondary { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, /// Cancel any ongoing reconciliation for this shard TenantShardCancelReconcile { #[arg(long)] @@ -146,6 +157,12 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, + TenantSetPreferredAz { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + preferred_az: Option, + }, /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. TenantDrop { @@ -395,11 +412,12 @@ async fn main() -> anyhow::Result<()> { resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr)); let mut table = comfy_table::Table::new(); - table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]); for node in resp { table.add_row([ format!("{}", node.id), node.listen_http_addr, + node.availability_zone_id, format!("{:?}", node.scheduling), format!("{:?}", node.availability), ]); @@ -472,6 +490,7 @@ async fn main() -> anyhow::Result<()> { let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", + "Preferred AZ", "ShardCount", "StripeSize", "Placement", @@ -481,6 +500,11 @@ async fn main() -> anyhow::Result<()> { let shard_zero = tenant.shards.into_iter().next().unwrap(); table.add_row([ format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), format!("{:?}", tenant.stripe_size), format!("{:?}", tenant.policy), @@ -540,10 +564,7 @@ async fn main() -> anyhow::Result<()> { tenant_shard_id, node, } => { - let req = TenantShardMigrateRequest { - tenant_shard_id, - node_id: node, - }; + let req = TenantShardMigrateRequest { node_id: node }; storcon_client .dispatch::( @@ -553,6 +574,20 
@@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardMigrateSecondary { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { node_id: node }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"), + Some(req), + ) + .await?; + } Command::TenantShardCancelReconcile { tenant_shard_id } => { storcon_client .dispatch::<(), ()>( @@ -596,6 +631,19 @@ async fn main() -> anyhow::Result<()> { None, ) .await?; + + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let nodes = nodes + .into_iter() + .map(|n| (n.id, n)) + .collect::>(); + println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); table.add_row(["Policy", &format!("{:?}", policy)]); @@ -604,7 +652,14 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); println!("Shards:"); let mut table = comfy_table::Table::new(); - table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + table.set_header([ + "Shard", + "Attached", + "Attached AZ", + "Secondary", + "Last error", + "status", + ]); for shard in shards { let secondary = shard .node_secondary @@ -627,11 +682,18 @@ async fn main() -> anyhow::Result<()> { } let status = status_parts.join(","); + let attached_node = shard + .node_attached + .as_ref() + .map(|id| nodes.get(id).expect("Shard references nonexistent node")); + table.add_row([ format!("{}", shard.tenant_shard_id), - shard - .node_attached - .map(|n| format!("{}", n)) + attached_node + .map(|n| format!("{} ({})", n.listen_http_addr, n.id)) + .unwrap_or(String::new()), + attached_node + .map(|n| n.availability_zone_id.clone()) .unwrap_or(String::new()), secondary, shard.last_error, @@ -640,6 +702,66 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::TenantSetPreferredAz { + tenant_id, + preferred_az, + } => { + // First learn about the tenant's shards 
+ let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + // Learn about nodes to validate the AZ ID + let nodes = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + if let Some(preferred_az) = &preferred_az { + let azs = nodes + .into_iter() + .map(|n| (n.availability_zone_id)) + .collect::>(); + if !azs.contains(preferred_az) { + anyhow::bail!( + "AZ {} not found on any node: known AZs are: {:?}", + preferred_az, + azs + ); + } + } else { + // Make it obvious to the user that since they've omitted an AZ, we're clearing it + eprintln!("Clearing preferred AZ for tenant {}", tenant_id); + } + + // Construct a request that modifies all the tenant's shards + let req = ShardsPreferredAzsRequest { + preferred_az_ids: describe_response + .shards + .into_iter() + .map(|s| { + ( + s.tenant_shard_id, + preferred_az.clone().map(AvailabilityZone), + ) + }) + .collect(), + }; + storcon_client + .dispatch::( + Method::PUT, + "control/v1/preferred_azs".to_string(), + Some(req), + ) + .await?; + } Command::TenantWarmup { tenant_id } => { let describe_response = storcon_client .dispatch::<(), TenantDescribeResponse>( @@ -915,10 +1037,7 @@ async fn main() -> anyhow::Result<()> { .dispatch::( Method::PUT, format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), - Some(TenantShardMigrateRequest { - tenant_shard_id: mv.tenant_shard_id, - node_id: mv.to, - }), + Some(TenantShardMigrateRequest { node_id: mv.to }), ) .await .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) diff --git a/docs/docker.md b/docs/docker.md index 0914a00082..ae74c2b2ab 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,15 +7,11 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the 
required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). -And additional intermediate image: - -- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. - ## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) +1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14) 2. `neondatabase/neon` diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 7eb3547183..f3880cb766 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -87,7 +87,7 @@ impl Display for AvailabilityZone { #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap, + pub preferred_az_ids: HashMap>, } #[derive(Serialize, Deserialize)] @@ -144,6 +144,8 @@ pub struct NodeDescribeResponse { pub availability: NodeAvailabilityWrapper, pub scheduling: NodeSchedulingPolicy, + pub availability_zone_id: String, + pub listen_http_addr: String, pub listen_http_port: u16, @@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard { /// specifies some constraints, e.g.
asking it to get off particular node(s) #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateRequest { - pub tenant_shard_id: TenantShardId, pub node_id: NodeId, } @@ -368,6 +369,16 @@ pub enum PlacementPolicy { Detached, } +impl PlacementPolicy { + pub fn want_secondaries(&self) -> usize { + match self { + PlacementPolicy::Attached(secondary_count) => *secondary_count, + PlacementPolicy::Secondary => 1, + PlacementPolicy::Detached => 0, + } + } +} + #[derive(Serialize, Deserialize, Debug)] pub struct TenantShardMigrateResponse {} diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index f0cd713c38..328dea5dec 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range { /// Non inherited range for vectored get. pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. -pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); +pub const SPARSE_RANGE: Range = Key::metadata_key_range(); impl Key { // AUX_FILES currently stores only data for logical replication (slots etc), and @@ -714,7 +714,42 @@ impl Key { // switch (and generally it likely should be optional), so ignore these. #[inline(always)] pub fn is_inherited_key(self) -> bool { - !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + if self.is_sparse() { + self.is_inherited_sparse_key() + } else { + !NON_INHERITED_RANGE.contains(&self) + } + } + + #[inline(always)] + pub fn is_sparse(self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Check if the key belongs to the inherited keyspace. 
+ fn is_inherited_sparse_key(self) -> bool { + debug_assert!(self.is_sparse()); + self.field1 == RELATION_SIZE_PREFIX + } + + pub fn sparse_non_inherited_keyspace() -> Range { + // The two keys are adjacent; if we have non-adjacent keys in the future, we should return a keyspace + debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX); + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } } #[inline(always)] diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 227bc19d67..2e6949e6ce 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -272,6 +272,8 @@ pub struct CompactInfoResponse { pub compact_key_range: Option, pub compact_lsn_range: Option, pub sub_compaction: bool, + pub running: bool, + pub job_id: usize, } #[derive(Serialize, Deserialize, Clone)] diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 94714359a3..50b2c69d24 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32); impl ProtocolVersion { pub const fn new(major: u16, minor: u16) -> Self { - Self((major as u32) << 16 | minor as u32) + Self(((major as u32) << 16) | minor as u32) } pub const fn minor(self) -> u16 { self.0 as u16 diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 49b1d9dc87..dae141bf77 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -43,6 +43,17 @@ impl RemoteStorageKind { } } +impl RemoteStorageConfig { + /// Helper to fetch the configured concurrency limit. + pub fn concurrency_limit(&self) -> Option { + match &self.storage { + RemoteStorageKind::LocalFs { .. 
} => None, + RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()), + RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()), + } + } +} + fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 5970836033..44565ee6a2 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -112,9 +112,9 @@ impl Serialize for Generation { // We should never be asked to serialize a None. Structures // that include an optional generation should convert None to an // Option::None - Err(serde::ser::Error::custom( - "Tried to serialize invalid generation ({self})", - )) + Err(serde::ser::Error::custom(format!( + "Tried to serialize invalid generation ({self:?})" + ))) } } } diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4b4aa88d6b..9f38373ca0 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tokio::sync::{mpsc, Mutex}; +use tokio::sync::{mpsc, Mutex, Notify}; use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A }; let seconds = match parse_query_param(&req, "seconds")? { None => 5, - Some(seconds @ 1..=30) => seconds, - Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))), + Some(seconds @ 1..=60) => seconds, + Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))), }; let frequency_hz = match parse_query_param(&req, "frequency")? { None => 99, Some(1001..) 
=> return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))), Some(frequency) => frequency, }; - - // Only allow one profiler at a time. - static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); - let _lock = PROFILE_LOCK - .try_lock() - .map_err(|_| ApiError::Conflict("profiler already running".into()))?; + let force: bool = parse_query_param(&req, "force")?.unwrap_or_default(); // Take the profile. - let report = tokio::task::spawn_blocking(move || { + static PROFILE_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + static PROFILE_CANCEL: Lazy = Lazy::new(Notify::new); + + let report = { + // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a + // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting + // for a lock(), to avoid races where the notify isn't currently awaited. + let _lock = loop { + match PROFILE_LOCK.try_lock() { + Ok(lock) => break lock, + Err(_) if force => PROFILE_CANCEL.notify_waiters(), + Err(_) => { + return Err(ApiError::Conflict( + "profiler already running (use ?force=true to cancel it)".into(), + )) + } + } + tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait + }; + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build()?; - std::thread::sleep(Duration::from_secs(seconds)); - guard.report().build() - }) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? - .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?; + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))?; + + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(seconds)) => {}, + _ = PROFILE_CANCEL.notified() => {}, + }; + + guard + .report() + .build() + .map_err(|err| ApiError::InternalServerError(err.into()))? + }; // Return the report in the requested format. 
match format { diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index f188165600..c874fa30ff 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -260,7 +260,7 @@ impl FromStr for Lsn { { let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?; let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?; - Ok(Lsn((left_num as u64) << 32 | right_num as u64)) + Ok(Lsn(((left_num as u64) << 32) | right_num as u64)) } else { Err(LsnParseError) } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 140b287ccc..8547746d94 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -44,6 +44,7 @@ postgres_backend.workspace = true postgres-protocol.workspace = true postgres-types.workspace = true postgres_initdb.workspace = true +pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true @@ -108,3 +109,7 @@ harness = false [[bench]] name = "bench_ingest" harness = false + +[[bench]] +name = "upload_queue" +harness = false diff --git a/pageserver/benches/upload_queue.rs b/pageserver/benches/upload_queue.rs new file mode 100644 index 0000000000..ed5daa8ae1 --- /dev/null +++ b/pageserver/benches/upload_queue.rs @@ -0,0 +1,87 @@ +//! Upload queue benchmarks. + +use std::str::FromStr as _; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use pageserver::tenant::metadata::TimelineMetadata; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask}; +use pageserver::tenant::IndexPart; +use pprof::criterion::{Output, PProfProfiler}; +use utils::generation::Generation; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +// Register benchmarks with Criterion. 
+criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_upload_queue_next_ready, +); +criterion_main!(benches); + +/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks +/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload +/// queue as a whole is thus quadratic. +/// +/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test +/// Delete and UploadMetadata instead. This is incidentally the most expensive case. +fn bench_upload_queue_next_ready(c: &mut Criterion) { + let mut g = c.benchmark_group("upload_queue_next_ready"); + for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] { + g.bench_function(format!("inprogress={inprogress}"), |b| { + run_bench(b, inprogress).unwrap() + }); + } + + fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> { + // Construct two layers. layer0 is in the indexes, layer1 will be deleted. + let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name"); + + let metadata = LayerFileMetadata { + shard: ShardIndex::new(ShardNumber(1), ShardCount(2)), + generation: Generation::Valid(1), + file_size: 0, + }; + + // Construct the (initial and uploaded) index with layer0. + let mut index = IndexPart::empty(TimelineMetadata::example()); + index.layer_metadata.insert(layer0, metadata.clone()); + + // Construct the queue. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&index, 0)?; + + // Populate inprogress_tasks with a bunch of layer1 deletions. 
+ let delete = UploadOp::Delete(Delete { + layers: vec![(layer1, metadata)], + }); + + for task_id in 0..(inprogress as u64) { + queue.inprogress_tasks.insert( + task_id, + Arc::new(UploadTask { + task_id, + retries: AtomicU32::new(0), + op: delete.clone(), + coalesced_ops: Vec::new(), + }), + ); + } + + // Benchmark index upload scheduling. + let index_upload = UploadOp::UploadMetadata { + uploaded: Box::new(index), + }; + + b.iter(|| { + queue.queued_operations.push_front(index_upload.clone()); + assert!(queue.next_ready().is_some()); + }); + + Ok(()) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 567a69da3b..921c6a5092 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "pageserver.pid"; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 60ef4c3702..94e0b101bd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, - TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let res = tenant.get_scheduled_compaction_tasks(timeline_id); - let mut resp = Vec::new(); - for item in res { - resp.push(CompactInfoResponse { - compact_key_range: item.compact_key_range, - compact_lsn_range: item.compact_lsn_range, - sub_compaction: item.sub_compaction, - }); - } + let resp = tenant.get_scheduled_compaction_tasks(timeline_id); json_response(StatusCode::OK, resp) } .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9b877fc368..3c4830e3cd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { - register_histogram!( - 
"pageserver_layers_visited_per_read_global", - "Number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], - ) - .expect("failed to define a metric") -}); - pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( "pageserver_layers_visited_per_vectored_read_global", @@ -3887,7 +3878,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &READ_NUM_LAYERS_VISITED, &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5c337bb6bf..f6d758ad22 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -21,6 +21,7 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -37,21 +38,17 @@ use remote_timeline_client::manifest::{ }; use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; -use std::collections::VecDeque; use std::fmt; use std::future::Future; use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; -use timeline::compaction::GcCompactJob; -use timeline::compaction::ScheduledCompactionTask; +use timeline::compaction::GcCompactionQueue; use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::offload::OffloadError; -use timeline::CompactFlags; use timeline::CompactOptions; -use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -347,10 +344,8 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, - /// Scheduled compaction tasks. 
Currently, this can only be populated by triggering - /// a manual gc-compaction from the manual compaction API. - scheduled_compaction_tasks: - std::sync::Mutex>>, + /// Scheduled gc-compaction tasks. + scheduled_compaction_tasks: std::sync::Mutex>>, /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy @@ -2999,104 +2994,18 @@ impl Tenant { if has_pending_l0_compaction_task { Some(true) } else { - let mut has_pending_scheduled_compaction_task; - let next_scheduled_compaction_task = { - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { - if !tline_pending_tasks.is_empty() { - info!( - "{} tasks left in the compaction schedule queue", - tline_pending_tasks.len() - ); - } - let next_task = tline_pending_tasks.pop_front(); - has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); - next_task - } else { - has_pending_scheduled_compaction_task = false; - None - } + let queue = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(timeline_id).cloned() }; - if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task - { - if !next_scheduled_compaction_task - .options - .flags - .contains(CompactFlags::EnhancedGcBottomMostCompaction) - { - warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); - } else if next_scheduled_compaction_task.options.sub_compaction { - info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); - let jobs: Vec = timeline - .gc_compaction_split_jobs( - GcCompactJob::from_compact_options( - next_scheduled_compaction_task.options.clone(), - ), - next_scheduled_compaction_task - .options - .sub_compaction_max_job_size_mb, - ) - .await - .map_err(CompactionError::Other)?; - if jobs.is_empty() { - 
info!("no jobs to run, skipping scheduled compaction task"); - } else { - has_pending_scheduled_compaction_task = true; - let jobs_len = jobs.len(); - let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(*timeline_id).or_default(); - for (idx, job) in jobs.into_iter().enumerate() { - // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` - // until we do further refactors to allow directly call `compact_with_gc`. - let mut flags: EnumSet = EnumSet::default(); - flags |= CompactFlags::EnhancedGcBottomMostCompaction; - if job.dry_run { - flags |= CompactFlags::DryRun; - } - let options = CompactOptions { - flags, - sub_compaction: false, - compact_key_range: Some(job.compact_key_range.into()), - compact_lsn_range: Some(job.compact_lsn_range.into()), - sub_compaction_max_job_size_mb: None, - }; - tline_pending_tasks.push_back(if idx == jobs_len - 1 { - ScheduledCompactionTask { - options, - // The last job in the queue sends the signal and releases the gc guard - result_tx: next_scheduled_compaction_task - .result_tx - .take(), - gc_block: next_scheduled_compaction_task - .gc_block - .take(), - } - } else { - ScheduledCompactionTask { - options, - result_tx: None, - gc_block: None, - } - }); - } - info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); - } - } else { - let _ = timeline - .compact_with_options( - cancel, - next_scheduled_compaction_task.options, - ctx, - ) - .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) - .await?; - if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { - // TODO: we can send compaction statistics in the future - tx.send(()).ok(); - } - } + if let Some(queue) = queue { + let has_pending_tasks = queue + .iteration(cancel, ctx, &self.gc_block, timeline) + .await?; + Some(has_pending_tasks) + } else { + Some(false) } - Some(has_pending_scheduled_compaction_task) } } else { None 
@@ -3126,34 +3035,32 @@ impl Tenant { } /// Cancel scheduled compaction tasks - pub(crate) fn cancel_scheduled_compaction( - &self, - timeline_id: TimelineId, - ) -> Vec { + pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { - let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); - current_tline_pending_tasks.into_iter().collect() - } else { - Vec::new() + if let Some(q) = guard.get_mut(&timeline_id) { + q.cancel_scheduled(); } } pub(crate) fn get_scheduled_compaction_tasks( &self, timeline_id: TimelineId, - ) -> Vec { - use itertools::Itertools; - let guard = self.scheduled_compaction_tasks.lock().unwrap(); - guard - .get(&timeline_id) - .map(|tline_pending_tasks| { - tline_pending_tasks - .iter() - .map(|x| x.options.clone()) - .collect_vec() - }) - .unwrap_or_default() + ) -> Vec { + let res = { + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard.get(&timeline_id).map(|q| q.remaining_jobs()) + }; + let Some((running, remaining)) = res else { + return Vec::new(); + }; + let mut result = Vec::new(); + if let Some((id, running)) = running { + result.extend(running.into_compact_info_resp(id, true)); + } + for (id, job) in remaining { + result.extend(job.into_compact_info_resp(id, false)); + } + result } /// Schedule a compaction task for a timeline. 
@@ -3162,20 +3069,12 @@ impl Tenant { timeline_id: TimelineId, options: CompactOptions, ) -> anyhow::Result> { - let gc_guard = match self.gc_block.start().await { - Ok(guard) => guard, - Err(e) => { - bail!("cannot run gc-compaction because gc is blocked: {}", e); - } - }; let (tx, rx) = tokio::sync::oneshot::channel(); let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); - let tline_pending_tasks = guard.entry(timeline_id).or_default(); - tline_pending_tasks.push_back(ScheduledCompactionTask { - options, - result_tx: Some(tx), - gc_block: Some(gc_guard), - }); + let q = guard + .entry(timeline_id) + .or_insert_with(|| Arc::new(GcCompactionQueue::new())); + q.schedule_manual_compaction(options, Some(tx)); Ok(rx) } @@ -5791,7 +5690,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use pageserver_api::value::Value; @@ -7850,7 +7749,18 @@ mod tests { let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap(); + let base_inherited_key_child = + Key::from_hex("610000000033333333444444445500000001").unwrap(); + let base_inherited_key_nonexist = + Key::from_hex("610000000033333333444444445500000002").unwrap(); + let base_inherited_key_overwrite = + Key::from_hex("610000000033333333444444445500000003").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case 
someone accidentally changed the prefix... + assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX); let tline = tenant .create_test_timeline_with_layers( @@ -7859,7 +7769,18 @@ mod tests { DEFAULT_PG_VERSION, &ctx, Vec::new(), // delta layers - vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + vec![( + Lsn(0x20), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a"), + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ], + )], // image layers Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN ) .await?; @@ -7873,7 +7794,18 @@ mod tests { Vec::new(), // delta layers vec![( Lsn(0x30), - vec![(base_key_child, test_img("metadata key 2"))], + vec![ + ( + base_inherited_key_child, + test_img("metadata inherited key 2"), + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a"), + ), + (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ], )], // image layers Lsn(0x30), ) @@ -7895,6 +7827,26 @@ mod tests { get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 1a")) + ); // test 
vectored get on child timeline assert_eq!( @@ -7909,6 +7861,82 @@ mod tests { get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, None ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?, + Some(test_img("metadata inherited key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?, + Some(test_img("metadata inherited key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2b")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?, + Some(test_img("metadata key overwrite 2a")) + ); + + // test vectored scan on parent timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = tline + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 1a") + ), + (base_key, test_img("metadata key 1")), + (base_key_overwrite, test_img("metadata key overwrite 1b")), + ] + ); + + // test vectored scan on child timeline + let mut reconstruct_state = ValuesReconstructState::new(); + let res = child + .get_vectored_impl( + KeySpace::single(Key::metadata_key_range()), + lsn, + &mut reconstruct_state, + &ctx, + ) + .await?; + + assert_eq!( + res.into_iter() + .map(|(k, v)| (k, v.unwrap())) + .collect::>(), + vec![ + (base_inherited_key, test_img("metadata inherited key 1")), + ( + base_inherited_key_child, + test_img("metadata inherited key 2") + ), + ( + base_inherited_key_overwrite, + test_img("metadata key overwrite 2a") + ), 
+ (base_key_child, test_img("metadata key 2")), + (base_key_overwrite, test_img("metadata key overwrite 2b")), + ] + ); Ok(()) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index d54dded778..edf2e6a3aa 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -11,7 +11,7 @@ pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig}; +use pageserver_api::models::{self, TenantConfigPatch}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -597,7 +597,7 @@ impl From for models::TenantConfig { .map(humantime), heatmap_period: value.heatmap_period.map(humantime), lazy_slru_download: value.lazy_slru_download, - timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index c77342b144..bb9df020b5 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -84,17 +84,17 @@ impl Value { fn to_u64(self) -> u64 { let b = &self.0; - (b[0] as u64) << 32 - | (b[1] as u64) << 24 - | (b[2] as u64) << 16 - | (b[3] as u64) << 8 + ((b[0] as u64) << 32) + | ((b[1] as u64) << 24) + | ((b[2] as u64) << 16) + | ((b[3] as u64) << 8) | b[4] as u64 } fn to_blknum(self) -> u32 { let b = &self.0; assert!(b[0] == 0x80); - (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | 
((b[3] as u32) << 8) | b[4] as u32 } } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 24440d4b35..d281eb305f 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -320,7 +320,6 @@ impl TimelineMetadata { // Checksums make it awkward to build a valid instance by hand. This helper // provides a TimelineMetadata with a valid checksum in its header. - #[cfg(test)] pub fn example() -> Self { let instance = Self::new( "0/16960E8".parse::().unwrap(), diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 813111245d..47c4a8637d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -63,22 +63,18 @@ //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as //! described above. +//! //! From the user's perspective, the operations are executed sequentially. //! Internally, the client knows which operations can be performed in parallel, //! and which operations act like a "barrier" that require preceding operations //! to finish. The calling code just needs to call the schedule-functions in the //! correct order, and the client will parallelize the operations in a way that -//! is safe. -//! -//! The caller should be careful with deletion, though. They should not delete -//! local files that have been scheduled for upload but not yet finished uploading. -//! Otherwise the upload will fail. To wait for an upload to finish, use -//! the 'wait_completion' function (more on that later.) +//! is safe. For more details, see `UploadOp::can_bypass`. //! //! All of this relies on the following invariants: //! //! - We rely on read-after write consistency in the remote storage. -//! - Layer files are immutable +//! - Layer files are immutable. //! //! 
NB: Pageserver assumes that it has exclusive write access to the tenant in remote //! storage. Different tenants can be attached to different pageservers, but if the @@ -429,8 +425,16 @@ impl RemoteTimelineClient { /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); info!( "initialized upload queue from remote index with {} layer files", @@ -445,8 +449,16 @@ impl RemoteTimelineClient { &self, local_metadata: &TimelineMetadata, ) -> anyhow::Result<()> { + // Set the maximum number of inprogress tasks to the remote storage concurrency. There's + // certainly no point in starting more upload tasks than this. 
+ let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_empty_remote(local_metadata)?; + upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?; self.update_remote_physical_size_gauge(None); info!("initialized upload queue as empty"); Ok(()) @@ -462,9 +474,15 @@ impl RemoteTimelineClient { let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!( "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; + let inprogress_limit = self + .conf + .remote_storage_config + .as_ref() + .and_then(|r| r.concurrency_limit()) + .unwrap_or(0); let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; + upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?; self.update_remote_physical_size_gauge(Some(index_part)); self.stop_impl(&mut upload_queue); @@ -1855,57 +1873,17 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Pick next tasks from the queue, and start as many of them as possible without violating /// the ordering constraints. /// - /// The caller needs to already hold the `upload_queue` lock. + /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does. + /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has + /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks. fn launch_queued_tasks(self: &Arc, upload_queue: &mut UploadQueueInitialized) { - while let Some(next_op) = upload_queue.queued_operations.front() { - // Can we run this task now? - let can_run_now = match next_op { - UploadOp::UploadLayer(..) => { - // Can always be scheduled. - true - } - UploadOp::UploadMetadata { .. 
} => { - // These can only be performed after all the preceding operations - // have finished. - upload_queue.inprogress_tasks.is_empty() - } - UploadOp::Delete(..) => { - // Wait for preceding uploads to finish. Concurrent deletions are OK, though. - upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len() - } + while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() { + debug!("starting op: {next_op}"); - UploadOp::Barrier(_) | UploadOp::Shutdown => { - upload_queue.inprogress_tasks.is_empty() - } - }; - - // If we cannot launch this task, don't look any further. - // - // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch - // them now, but we don't try to do that currently. For example, if the frontmost task - // is an index-file upload that cannot proceed until preceding uploads have finished, we - // could still start layer uploads that were scheduled later. - if !can_run_now { - break; - } - - if let UploadOp::Shutdown = next_op { - // leave the op in the queue but do not start more tasks; it will be dropped when - // the stop is called. - upload_queue.shutdown_ready.close(); - break; - } - - // We can launch this task. Remove it from the queue first. - let mut next_op = upload_queue.queued_operations.pop_front().unwrap(); - - debug!("starting op: {}", next_op); - - // Update the counters and prepare + // Prepare upload. match &mut next_op { UploadOp::UploadLayer(layer, meta, mode) => { if upload_queue @@ -1916,18 +1894,14 @@ impl RemoteTimelineClient { } else { *mode = Some(OpType::MayReorder) } - upload_queue.num_inprogress_layer_uploads += 1; - } - UploadOp::UploadMetadata { .. } => { - upload_queue.num_inprogress_metadata_uploads += 1; } + UploadOp::UploadMetadata { .. 
} => {} UploadOp::Delete(Delete { layers }) => { for (name, meta) in layers { upload_queue .recently_deleted .insert((name.clone(), meta.generation)); } - upload_queue.num_inprogress_deletions += 1; } UploadOp::Barrier(sender) => { sender.send_replace(()); @@ -1944,6 +1918,7 @@ impl RemoteTimelineClient { let task = Arc::new(UploadTask { task_id: upload_task_id, op: next_op, + coalesced_ops, retries: AtomicU32::new(0), }); upload_queue @@ -2027,6 +2002,8 @@ impl RemoteTimelineClient { let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => { + // TODO: check if this mechanism can be removed now that can_bypass() performs + // conflict checks during scheduling. if let Some(OpType::FlushDeletion) = mode { if self.config.read().unwrap().block_deletions { // Of course, this is not efficient... but usually the queue should be empty. @@ -2249,13 +2226,8 @@ impl RemoteTimelineClient { upload_queue.inprogress_tasks.remove(&task.task_id); let lsn_update = match task.op { - UploadOp::UploadLayer(_, _, _) => { - upload_queue.num_inprogress_layer_uploads -= 1; - None - } + UploadOp::UploadLayer(_, _, _) => None, UploadOp::UploadMetadata { ref uploaded } => { - upload_queue.num_inprogress_metadata_uploads -= 1; - // the task id is reused as a monotonicity check for storing the "clean" // IndexPart. let last_updater = upload_queue.clean.1; @@ -2289,10 +2261,7 @@ impl RemoteTimelineClient { None } } - UploadOp::Delete(_) => { - upload_queue.num_inprogress_deletions -= 1; - None - } + UploadOp::Delete(_) => None, UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(), }; @@ -2317,6 +2286,9 @@ impl RemoteTimelineClient { } self.metric_end(&task.op); + for coalesced_op in &task.coalesced_ops { + self.metric_end(coalesced_op); + } } fn metric_impl( @@ -2409,6 +2381,7 @@ impl RemoteTimelineClient { // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point. 
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { + inprogress_limit: initialized.inprogress_limit, task_counter: 0, dirty: initialized.dirty.clone(), clean: initialized.clean.clone(), @@ -2416,9 +2389,6 @@ impl RemoteTimelineClient { visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::default(), queued_operations: VecDeque::default(), #[cfg(feature = "testing")] @@ -2445,14 +2415,6 @@ impl RemoteTimelineClient { } }; - // consistency check - assert_eq!( - qi.num_inprogress_layer_uploads - + qi.num_inprogress_metadata_uploads - + qi.num_inprogress_deletions, - qi.inprogress_tasks.len() - ); - // We don't need to do anything here for in-progress tasks. They will finish // on their own, decrement the unfinished-task counter themselves, and observe // that the queue is Stopped. 
@@ -2899,8 +2861,8 @@ mod tests { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.is_empty()); - assert!(upload_queue.inprogress_tasks.len() == 2); - assert!(upload_queue.num_inprogress_layer_uploads == 2); + assert_eq!(upload_queue.inprogress_tasks.len(), 2); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2); // also check that `latest_file_changes` was updated assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); @@ -2970,8 +2932,8 @@ mod tests { // Deletion schedules upload of the index file, and the file deletion itself assert_eq!(upload_queue.queued_operations.len(), 2); assert_eq!(upload_queue.inprogress_tasks.len(), 1); - assert_eq!(upload_queue.num_inprogress_layer_uploads, 1); - assert_eq!(upload_queue.num_inprogress_deletions, 0); + assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1); + assert_eq!(upload_queue.num_inprogress_deletions(), 0); assert_eq!( upload_queue.latest_files_changes_since_metadata_upload_scheduled, 0 diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 51f093cb87..244be5bbb7 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -104,7 +104,7 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; - pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + pub fn empty(metadata: TimelineMetadata) -> Self { IndexPart { version: Self::LATEST_VERSION, layer_metadata: Default::default(), diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index b8206fca5a..3913637ca0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -12,7 +12,7 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; -use 
pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE}; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; @@ -209,7 +209,7 @@ impl ValuesReconstructState { .keys .entry(*key) .or_insert(Ok(VectoredValueReconstructState::default())); - let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key); + let is_sparse_key = key.is_sparse(); if let Ok(state) = state { let key_done = match state.situation { ValueReconstructSituation::Complete => { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 71e53da20f..2b67f55a17 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { /// /// Layout: /// - 1 bit: `will_init` -/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` -/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos` #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct IndexEntry(u64); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8933e8ceb1..2b06c88e8b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1812,7 +1812,7 @@ enum LayerKind { /// Guard for forcing a layer be resident while it exists. 
#[derive(Clone)] -pub(crate) struct ResidentLayer { +pub struct ResidentLayer { owner: Layer, downloaded: Arc, } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e222a624de..4aa6b7a05a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -27,7 +27,7 @@ use pageserver_api::{ config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, - NON_INHERITED_SPARSE_RANGE, + SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -3221,7 +3221,7 @@ impl Timeline { // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. keyspace.remove_overlapping_with(&KeySpace { - ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved @@ -3242,7 +3242,11 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + // Do not fire missing key error for sparse keys. + removed.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); if !removed.is_empty() { break Some(removed); } @@ -3257,6 +3261,21 @@ impl Timeline { timeline = &*timeline_owned; }; + // Remove sparse keys from the keyspace so that it doesn't fire errors. 
+ let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace { + let mut missing_keyspace = missing_keyspace; + missing_keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![SPARSE_RANGE], + }); + if missing_keyspace.is_empty() { + None + } else { + Some(missing_keyspace) + } + } else { + None + }; + if let Some(missing_keyspace) = missing_keyspace { return Err(GetVectoredError::MissingKey(MissingKeyError { key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ @@ -3762,36 +3781,35 @@ impl Timeline { return Err(FlushLayerError::Cancelled); } - let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( - &rel_partition, - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace. + // So that the key ranges don't overlap. + let mut partitions = KeyPartitioning::default(); + partitions.parts.extend(rel_partition.parts); if !metadata_partition.parts.is_empty() { assert_eq!( metadata_partition.parts.len(), 1, "currently sparse keyspace should only contain a single metadata keyspace" ); - layers_to_upload.extend( - self.create_image_layers( - // Safety: create_image_layers treat sparse keyspaces differently that it does not scan - // every single key within the keyspace, and therefore, it's safe to force converting it - // into a dense keyspace before calling this function. - &metadata_partition.into_dense(), - self.initdb_lsn, - ImageLayerCreationMode::Initial, - ctx, - ) - .await?, - ); + // Safety: create_image_layers treat sparse keyspaces differently that it does not scan + // every single key within the keyspace, and therefore, it's safe to force converting it + // into a dense keyspace before calling this function. 
+ partitions + .parts + .extend(metadata_partition.into_dense().parts); } + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &partitions, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + (layers_to_upload, None) } else { // Normal case, write out a L0 delta layer file. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 55cde8603e..05f8d476f9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. -use std::collections::{BinaryHeap, HashMap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -16,10 +16,12 @@ use super::{ use anyhow::{anyhow, bail, Context}; use bytes::Bytes; +use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; +use pageserver_api::models::CompactInfoResponse; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; @@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder} use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; +use crate::tenant::gc_block::GcBlock; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -63,16 +66,284 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; -/// A scheduled compaction task. 
-pub(crate) struct ScheduledCompactionTask { - /// It's unfortunate that we need to store a compact options struct here because the only outer - /// API we can call here is `compact_with_options` which does a few setup calls before starting the - /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future. - pub options: CompactOptions, - /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender. - pub result_tx: Option>, - /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard. - pub gc_block: Option, +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +pub struct GcCompactionJobId(pub usize); + +impl std::fmt::Display for GcCompactionJobId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +#[derive(Debug, Clone)] +pub enum GcCompactionQueueItem { + Manual(CompactOptions), + SubCompactionJob(CompactOptions), + #[allow(dead_code)] + UpdateL2Lsn(Lsn), + Notify(GcCompactionJobId), +} + +impl GcCompactionQueueItem { + pub fn into_compact_info_resp( + self, + id: GcCompactionJobId, + running: bool, + ) -> Option { + match self { + GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + compact_key_range: options.compact_key_range, + compact_lsn_range: options.compact_lsn_range, + sub_compaction: options.sub_compaction, + running, + job_id: id.0, + }), + GcCompactionQueueItem::UpdateL2Lsn(_) => None, + GcCompactionQueueItem::Notify(_) => None, + } + } +} + +struct GcCompactionQueueInner { + running: Option<(GcCompactionJobId, GcCompactionQueueItem)>, + queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>, + 
notify: HashMap>, + gc_guards: HashMap, + last_id: GcCompactionJobId, +} + +impl GcCompactionQueueInner { + fn next_id(&mut self) -> GcCompactionJobId { + let id = self.last_id; + self.last_id = GcCompactionJobId(id.0 + 1); + id + } +} + +/// A structure to store gc_compaction jobs. +pub struct GcCompactionQueue { + /// All items in the queue, and the currently-running job. + inner: std::sync::Mutex, + /// Ensure only one thread is consuming the queue. + consumer_lock: tokio::sync::Mutex<()>, +} + +impl GcCompactionQueue { + pub fn new() -> Self { + GcCompactionQueue { + inner: std::sync::Mutex::new(GcCompactionQueueInner { + running: None, + queued: VecDeque::new(), + notify: HashMap::new(), + gc_guards: HashMap::new(), + last_id: GcCompactionJobId(0), + }), + consumer_lock: tokio::sync::Mutex::new(()), + } + } + + pub fn cancel_scheduled(&self) { + let mut guard = self.inner.lock().unwrap(); + guard.queued.clear(); + guard.notify.clear(); + guard.gc_guards.clear(); + } + + /// Schedule a manual compaction job. + pub fn schedule_manual_compaction( + &self, + options: CompactOptions, + notify: Option>, + ) -> GcCompactionJobId { + let mut guard = self.inner.lock().unwrap(); + let id = guard.next_id(); + guard + .queued + .push_back((id, GcCompactionQueueItem::Manual(options))); + if let Some(notify) = notify { + guard.notify.insert(id, notify); + } + info!("scheduled compaction job id={}", id); + id + } + + /// Trigger an auto compaction. + #[allow(dead_code)] + pub fn trigger_auto_compaction(&self, _: &Arc) {} + + /// Notify the caller the job has finished and unblock GC. 
+ fn notify_and_unblock(&self, id: GcCompactionJobId) { + info!("compaction job id={} finished", id); + let mut guard = self.inner.lock().unwrap(); + if let Some(blocking) = guard.gc_guards.remove(&id) { + drop(blocking) + } + if let Some(tx) = guard.notify.remove(&id) { + let _ = tx.send(()); + } + } + + async fn handle_sub_compaction( + &self, + id: GcCompactionJobId, + options: CompactOptions, + timeline: &Arc, + gc_block: &GcBlock, + ) -> Result<(), CompactionError> { + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs: Vec = timeline + .gc_compaction_split_jobs( + GcCompactJob::from_compact_options(options.clone()), + options.sub_compaction_max_job_size_mb, + ) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + self.notify_and_unblock(id); + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + + let jobs_len = jobs.len(); + let mut pending_tasks = Vec::new(); + for job in jobs { + // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` + // until we do further refactors to allow directly call `compact_with_gc`. 
+ let mut flags: EnumSet = EnumSet::default(); + flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if job.dry_run { + flags |= CompactFlags::DryRun; + } + let options = CompactOptions { + flags, + sub_compaction: false, + compact_key_range: Some(job.compact_key_range.into()), + compact_lsn_range: Some(job.compact_lsn_range.into()), + sub_compaction_max_job_size_mb: None, + }; + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + } + pending_tasks.push(GcCompactionQueueItem::Notify(id)); + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + let mut tasks = Vec::new(); + for task in pending_tasks { + let id = guard.next_id(); + tasks.push((id, task)); + } + tasks.reverse(); + for item in tasks { + guard.queued.push_front(item); + } + } + info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + } + Ok(()) + } + + /// Take a job from the queue and process it. Returns if there are still pending tasks. 
+ pub async fn iteration( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, + ) -> Result { + let _one_op_at_a_time_guard = self.consumer_lock.lock().await; + let has_pending_tasks; + let (id, item) = { + let mut guard = self.inner.lock().unwrap(); + let Some((id, item)) = guard.queued.pop_front() else { + return Ok(false); + }; + guard.running = Some((id, item.clone())); + has_pending_tasks = !guard.queued.is_empty(); + (id, item) + }; + + match item { + GcCompactionQueueItem::Manual(options) => { + if !options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options); + } else if options.sub_compaction { + self.handle_sub_compaction(id, options, timeline, gc_block) + .await?; + } else { + let gc_guard = match gc_block.start().await { + Ok(guard) => guard, + Err(e) => { + return Err(CompactionError::Other(anyhow!( + "cannot run gc-compaction because gc is blocked: {}", + e + ))); + } + }; + { + let mut guard = self.inner.lock().unwrap(); + guard.gc_guards.insert(id, gc_guard); + } + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + self.notify_and_unblock(id); + } + } + GcCompactionQueueItem::SubCompactionJob(options) => { + let _ = timeline + .compact_with_options(cancel, options, ctx) + .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id)) + .await?; + } + GcCompactionQueueItem::Notify(id) => { + self.notify_and_unblock(id); + } + GcCompactionQueueItem::UpdateL2Lsn(_) => { + unreachable!() + } + } + { + let mut guard = self.inner.lock().unwrap(); + guard.running = None; + } + Ok(has_pending_tasks) + } + + #[allow(clippy::type_complexity)] + pub fn remaining_jobs( + &self, + ) -> ( + Option<(GcCompactionJobId, GcCompactionQueueItem)>, + VecDeque<(GcCompactionJobId, 
GcCompactionQueueItem)>, + ) { + let guard = self.inner.lock().unwrap(); + (guard.running.clone(), guard.queued.clone()) + } + + #[allow(dead_code)] + pub fn remaining_jobs_num(&self) -> usize { + let guard = self.inner.lock().unwrap(); + guard.queued.len() + if guard.running.is_some() { 1 } else { 0 } + } } /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d74faa1af5..3a8796add8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -403,7 +403,7 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. 
- let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) { + let needs_last_record_lsn_advance = match next_record_lsn { Some(lsn) if lsn > modification.get_lsn() => { modification.set_lsn(lsn).unwrap(); true diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index ef3aa759f3..d302205ffe 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,28 +1,38 @@ +use std::collections::{HashMap, HashSet, VecDeque}; +use std::fmt::Debug; +use std::sync::atomic::AtomicU32; +use std::sync::Arc; + +use super::remote_timeline_client::is_same_remote_layer_path; +use super::storage_layer::AsLayerDesc as _; use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::index::LayerFileMetadata; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; +use utils::generation::Generation; +use utils::lsn::{AtomicLsn, Lsn}; use chrono::NaiveDateTime; -use std::sync::Arc; +use once_cell::sync::Lazy; use tracing::info; -use utils::lsn::AtomicLsn; -use std::sync::atomic::AtomicU32; -use utils::lsn::Lsn; +/// Kill switch for upload queue reordering in case it causes problems. +/// TODO: remove this once we have confidence in it. +static DISABLE_UPLOAD_QUEUE_REORDERING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_REORDERING").as_deref() == Ok("true")); -use utils::generation::Generation; +/// Kill switch for index upload coalescing in case it causes problems. +/// TODO: remove this once we have confidence in it. 
+static DISABLE_UPLOAD_QUEUE_INDEX_COALESCING: Lazy = + Lazy::new(|| std::env::var("DISABLE_UPLOAD_QUEUE_INDEX_COALESCING").as_deref() == Ok("true")); // clippy warns that Uninitialized is much smaller than Initialized, which wastes // memory for Uninitialized variants. Doesn't matter in practice, there are not // that many upload queues in a running pageserver, and most of them are initialized // anyway. #[allow(clippy::large_enum_variant)] -pub(super) enum UploadQueue { +pub enum UploadQueue { Uninitialized, Initialized(UploadQueueInitialized), Stopped(UploadQueueStopped), @@ -39,13 +49,16 @@ impl UploadQueue { } #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] -pub(crate) enum OpType { +pub enum OpType { MayReorder, FlushDeletion, } /// This keeps track of queued and in-progress tasks. -pub(crate) struct UploadQueueInitialized { +pub struct UploadQueueInitialized { + /// Maximum number of inprogress tasks to schedule. 0 is no limit. + pub(crate) inprogress_limit: usize, + /// Counter to assign task IDs pub(crate) task_counter: u64, @@ -70,21 +83,16 @@ pub(crate) struct UploadQueueInitialized { /// we skip validation) pub(crate) visible_remote_consistent_lsn: Arc, - // Breakdown of different kinds of tasks currently in-progress - pub(crate) num_inprogress_layer_uploads: usize, - pub(crate) num_inprogress_metadata_uploads: usize, - pub(crate) num_inprogress_deletions: usize, - /// Tasks that are currently in-progress. In-progress means that a tokio Task /// has been launched for it. An in-progress task can be busy uploading, but it can /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can /// be waiting for retry in `exponential_backoff`. - pub(crate) inprogress_tasks: HashMap>, + pub inprogress_tasks: HashMap>, /// Queued operations that have not been launched yet. They might depend on previous /// tasks to finish. For example, metadata upload cannot be performed before all /// preceding layer file uploads have completed. 
- pub(crate) queued_operations: VecDeque, + pub queued_operations: VecDeque, /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around /// for error logging. @@ -122,6 +130,167 @@ impl UploadQueueInitialized { let lsn = self.clean.0.metadata.disk_consistent_lsn(); self.clean.1.map(|_| lsn) } + + /// Returns and removes the next ready operation from the queue, if any. This isn't necessarily + /// the first operation in the queue, to avoid head-of-line blocking -- an operation can jump + /// the queue if it doesn't conflict with operations ahead of it. + /// + /// Also returns any operations that were coalesced into this one, e.g. multiple index uploads. + /// + /// None may be returned even if the queue isn't empty, if no operations are ready yet. + /// + /// NB: this is quadratic, but queues are expected to be small, and bounded by inprogress_limit. + pub fn next_ready(&mut self) -> Option<(UploadOp, Vec)> { + // If inprogress_tasks is already at limit, don't schedule anything more. + if self.inprogress_limit > 0 && self.inprogress_tasks.len() >= self.inprogress_limit { + return None; + } + + for (i, candidate) in self.queued_operations.iter().enumerate() { + // If this candidate is ready, go for it. Otherwise, try the next one. + if self.is_ready(i) { + // Shutdown operations are left at the head of the queue, to prevent further + // operations from starting. Signal that we're ready to shut down. + if matches!(candidate, UploadOp::Shutdown) { + assert!(self.inprogress_tasks.is_empty(), "shutdown with tasks"); + assert_eq!(i, 0, "shutdown not at head of queue"); + self.shutdown_ready.close(); + return None; + } + + let mut op = self.queued_operations.remove(i).expect("i can't disappear"); + + // Coalesce any back-to-back index uploads by only uploading the newest one that's + // ready. This typically happens with layer/index/layer/index/... sequences, where + // the layers bypass the indexes, leaving the indexes queued. 
+ // + // If other operations are interleaved between index uploads we don't try to + // coalesce them, since we may as well update the index concurrently with them. + // This keeps the index fresh and avoids starvation. + // + // NB: we assume that all uploaded indexes have the same remote path. This + // is true at the time of writing: the path only depends on the tenant, + // timeline and generation, all of which are static for a timeline instance. + // Otherwise, we must be careful not to coalesce different paths. + let mut coalesced_ops = Vec::new(); + if matches!(op, UploadOp::UploadMetadata { .. }) { + while let Some(UploadOp::UploadMetadata { .. }) = self.queued_operations.get(i) + { + if *DISABLE_UPLOAD_QUEUE_INDEX_COALESCING { + break; + } + if !self.is_ready(i) { + break; + } + coalesced_ops.push(op); + op = self.queued_operations.remove(i).expect("i can't disappear"); + } + } + + return Some((op, coalesced_ops)); + } + + // Nothing can bypass a barrier or shutdown. If it wasn't scheduled above, give up. + if matches!(candidate, UploadOp::Barrier(_) | UploadOp::Shutdown) { + return None; + } + + // If upload queue reordering is disabled, bail out after the first operation. + if *DISABLE_UPLOAD_QUEUE_REORDERING { + return None; + } + } + None + } + + /// Returns true if the queued operation at the given position is ready to be uploaded, i.e. if + /// it doesn't conflict with any in-progress or queued operations ahead of it. Operations are + /// allowed to skip the queue when it's safe to do so, to increase parallelism. + /// + /// The position must be valid for the queue size. + fn is_ready(&self, pos: usize) -> bool { + let candidate = self.queued_operations.get(pos).expect("invalid position"); + self + // Look at in-progress operations, in random order. + .inprogress_tasks + .values() + .map(|task| &task.op) + // Then queued operations ahead of the candidate, front-to-back. 
+ .chain(self.queued_operations.iter().take(pos)) + // Keep track of the active index ahead of each operation. This is used to ensure that + // an upload doesn't skip the queue too far, such that it modifies a layer that's + // referenced by an active index. + // + // It's okay that in-progress operations are emitted in random order above, since at + // most one of them can be an index upload (enforced by can_bypass). + .scan(&self.clean.0, |next_active_index, op| { + let active_index = *next_active_index; + if let UploadOp::UploadMetadata { ref uploaded } = op { + *next_active_index = uploaded; // stash index for next operation after this + } + Some((op, active_index)) + }) + // Check if the candidate can bypass all of them. + .all(|(op, active_index)| candidate.can_bypass(op, active_index)) + } + + /// Returns the number of in-progress deletion operations. + #[cfg(test)] + pub(crate) fn num_inprogress_deletions(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::Delete(_))) + .count() + } + + /// Returns the number of in-progress layer uploads. + #[cfg(test)] + pub(crate) fn num_inprogress_layer_uploads(&self) -> usize { + self.inprogress_tasks + .iter() + .filter(|(_, t)| matches!(t.op, UploadOp::UploadLayer(_, _, _))) + .count() + } + + /// Test helper that schedules all ready operations into inprogress_tasks, and returns + /// references to them. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn schedule_ready(&mut self) -> Vec> { + let mut tasks = Vec::new(); + // NB: schedule operations one by one, to handle conflicts with inprogress_tasks. 
+ while let Some((op, coalesced_ops)) = self.next_ready() { + self.task_counter += 1; + let task = Arc::new(UploadTask { + task_id: self.task_counter, + op, + coalesced_ops, + retries: 0.into(), + }); + self.inprogress_tasks.insert(task.task_id, task.clone()); + tasks.push(task); + } + tasks + } + + /// Test helper that marks an operation as completed, removing it from inprogress_tasks. + /// + /// TODO: the corresponding production logic should be moved from RemoteTimelineClient into + /// UploadQueue, so we can use the same code path. + #[cfg(test)] + fn complete(&mut self, task_id: u64) { + let Some(task) = self.inprogress_tasks.remove(&task_id) else { + return; + }; + // Update the clean index on uploads. + if let UploadOp::UploadMetadata { ref uploaded } = task.op { + if task.task_id > self.clean.1.unwrap_or_default() { + self.clean = (*uploaded.clone(), Some(task.task_id)); + } + } + } } #[derive(Clone, Copy)] @@ -131,12 +300,12 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStoppedDeletable { +pub struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } -pub(super) enum UploadQueueStopped { +pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, } @@ -163,9 +332,10 @@ impl NotInitialized { } impl UploadQueue { - pub(crate) fn initialize_empty_remote( + pub fn initialize_empty_remote( &mut self, metadata: &TimelineMetadata, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -179,15 +349,13 @@ impl UploadQueue { let index_part = IndexPart::empty(metadata.clone()); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are 
boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -202,9 +370,10 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialize_with_current_remote_index_part( + pub fn initialize_with_current_remote_index_part( &mut self, index_part: &IndexPart, + inprogress_limit: usize, ) -> anyhow::Result<&mut UploadQueueInitialized> { match self { UploadQueue::Uninitialized => (), @@ -219,6 +388,7 @@ impl UploadQueue { ); let state = UploadQueueInitialized { + inprogress_limit, dirty: index_part.clone(), clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, @@ -227,9 +397,6 @@ impl UploadQueue { ), // what follows are boring default initializations task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, inprogress_tasks: HashMap::new(), queued_operations: VecDeque::new(), #[cfg(feature = "testing")] @@ -244,9 +411,7 @@ impl UploadQueue { Ok(self.initialized_mut().expect("we just set it")) } - pub(crate) fn initialized_mut( - &mut self, - ) -> Result<&mut UploadQueueInitialized, NotInitialized> { + pub fn initialized_mut(&mut self) -> Result<&mut UploadQueueInitialized, NotInitialized> { use UploadQueue::*; match self { Uninitialized => Err(NotInitialized::Uninitialized), @@ -276,23 +441,27 @@ impl UploadQueue { /// An in-progress upload or delete task. #[derive(Debug)] -pub(crate) struct UploadTask { +pub struct UploadTask { /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - pub(crate) task_id: u64, - pub(crate) retries: AtomicU32, - - pub(crate) op: UploadOp, + pub task_id: u64, + /// Number of task retries. + pub retries: AtomicU32, + /// The upload operation. 
+ pub op: UploadOp, + /// Any upload operations that were coalesced into this operation. This typically happens with + /// back-to-back index uploads, see `UploadQueueInitialized::next_ready()`. + pub coalesced_ops: Vec, } /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug, Clone)] -pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, +pub struct Delete { + pub layers: Vec<(LayerName, LayerFileMetadata)>, } -#[derive(Debug)] -pub(crate) enum UploadOp { +#[derive(Clone, Debug)] +pub enum UploadOp { /// Upload a layer file. The last field indicates the last operation for thie file. UploadLayer(ResidentLayer, LayerFileMetadata, Option), @@ -338,3 +507,796 @@ impl std::fmt::Display for UploadOp { } } } + +impl UploadOp { + /// Returns true if self can bypass other, i.e. if the operations don't conflict. index is the + /// active index when other would be uploaded -- if we allow self to bypass other, this would + /// be the active index when self is uploaded. + pub fn can_bypass(&self, other: &UploadOp, index: &IndexPart) -> bool { + match (self, other) { + // Nothing can bypass a barrier or shutdown, and it can't bypass anything. + (UploadOp::Barrier(_), _) | (_, UploadOp::Barrier(_)) => false, + (UploadOp::Shutdown, _) | (_, UploadOp::Shutdown) => false, + + // Uploads and deletes can bypass each other unless they're for the same file. 
+ (UploadOp::UploadLayer(a, ameta, _), UploadOp::UploadLayer(b, bmeta, _)) => { + let aname = &a.layer_desc().layer_name(); + let bname = &b.layer_desc().layer_name(); + !is_same_remote_layer_path(aname, ameta, bname, bmeta) + } + (UploadOp::UploadLayer(u, umeta, _), UploadOp::Delete(d)) + | (UploadOp::Delete(d), UploadOp::UploadLayer(u, umeta, _)) => { + d.layers.iter().all(|(dname, dmeta)| { + !is_same_remote_layer_path(&u.layer_desc().layer_name(), umeta, dname, dmeta) + }) + } + + // Deletes are idempotent and can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => true, + + // Uploads and deletes can bypass an index upload as long as neither the uploaded index + // nor the active index below it references the file. A layer can't be modified or + // deleted while referenced by an index. + // + // Similarly, index uploads can bypass uploads and deletes as long as neither the + // uploaded index nor the active index references the file (the latter would be + // incorrect use by the caller). + (UploadOp::UploadLayer(u, umeta, _), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::UploadLayer(u, umeta, _)) => { + let uname = u.layer_desc().layer_name(); + !i.references(&uname, umeta) && !index.references(&uname, umeta) + } + (UploadOp::Delete(d), UploadOp::UploadMetadata { uploaded: i }) + | (UploadOp::UploadMetadata { uploaded: i }, UploadOp::Delete(d)) => { + d.layers.iter().all(|(dname, dmeta)| { + !i.references(dname, dmeta) && !index.references(dname, dmeta) + }) + } + + // Indexes can never bypass each other. They can coalesce though, and + // `UploadQueue::next_ready()` currently does this when possible. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. 
}) => false, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::tenant::storage_layer::layer::local_layer_path; + use crate::tenant::storage_layer::Layer; + use crate::tenant::Timeline; + use crate::DEFAULT_PG_VERSION; + use itertools::Itertools as _; + use std::str::FromStr as _; + use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + + /// Test helper which asserts that two operations are the same, in lieu of UploadOp PartialEq. + #[track_caller] + fn assert_same_op(a: &UploadOp, b: &UploadOp) { + use UploadOp::*; + match (a, b) { + (UploadLayer(a, ameta, atype), UploadLayer(b, bmeta, btype)) => { + assert_eq!(a.layer_desc().layer_name(), b.layer_desc().layer_name()); + assert_eq!(ameta, bmeta); + assert_eq!(atype, btype); + } + (Delete(a), Delete(b)) => assert_eq!(a.layers, b.layers), + (UploadMetadata { uploaded: a }, UploadMetadata { uploaded: b }) => assert_eq!(a, b), + (Barrier(_), Barrier(_)) => {} + (Shutdown, Shutdown) => {} + (a, b) => panic!("{a:?} != {b:?}"), + } + } + + /// Test helper which asserts that two sets of operations are the same. + #[track_caller] + fn assert_same_ops<'a>( + a: impl IntoIterator, + b: impl IntoIterator, + ) { + a.into_iter() + .zip_eq(b) + .for_each(|(a, b)| assert_same_op(a, b)) + } + + /// Test helper to construct a test timeline. + /// + /// TODO: it really shouldn't be necessary to construct an entire tenant and timeline just to + /// test the upload queue -- decouple ResidentLayer from Timeline. + /// + /// TODO: the upload queue uses TimelineMetadata::example() instead, because there's no way to + /// obtain a TimelineMetadata from a Timeline. + fn make_timeline() -> Arc { + // Grab the current test name from the current thread name. + // TODO: TenantHarness shouldn't take a &'static str, but just leak the test name for now. 
+ let test_name = std::thread::current().name().unwrap().to_string(); + let test_name = Box::leak(test_name.into_boxed_str()); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + + runtime + .block_on(async { + let harness = TenantHarness::create(test_name).await?; + let (tenant, ctx) = harness.load().await; + tenant + .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx) + .await + }) + .expect("failed to create timeline") + } + + /// Test helper to construct an (empty) resident layer. + fn make_layer(timeline: &Arc, name: &str) -> ResidentLayer { + make_layer_with_size(timeline, name, 0) + } + + /// Test helper to construct a resident layer with the given size. + fn make_layer_with_size(timeline: &Arc, name: &str, size: usize) -> ResidentLayer { + let metadata = LayerFileMetadata { + generation: timeline.generation, + shard: timeline.get_shard_index(), + file_size: size as u64, + }; + make_layer_with_metadata(timeline, name, metadata) + } + + /// Test helper to construct a layer with the given metadata. + fn make_layer_with_metadata( + timeline: &Arc, + name: &str, + metadata: LayerFileMetadata, + ) -> ResidentLayer { + let name = LayerName::from_str(name).expect("invalid name"); + let local_path = local_layer_path( + timeline.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &metadata.generation, + ); + std::fs::write(&local_path, vec![0; metadata.file_size as usize]) + .expect("failed to write file"); + Layer::for_resident(timeline.conf, timeline, local_path, name, metadata) + } + + /// Test helper to add a layer to an index and return a new index. + fn index_with(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + Box::new(index) + } + + /// Test helper to remove a layer from an index and return a new index. 
+ fn index_without(index: &IndexPart, layer: &ResidentLayer) -> Box { + let mut index = index.clone(); + index + .layer_metadata + .remove(&layer.layer_desc().layer_name()); + Box::new(index) + } + + /// Nothing can bypass a barrier, and it can't bypass inprogress tasks. + #[test] + fn schedule_barrier() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let (barrier, _) = tokio::sync::watch::channel(()); + + // Enqueue non-conflicting upload, delete, and index before and after a barrier. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Barrier(barrier), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the barrier. 
+ let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Barrier(_)) + )); + + // Complete the initial operations. The barrier isn't scheduled while they're pending. + for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // Schedule the barrier. The later tasks won't schedule until it completes. + let tasks = queue.schedule_ready(); + + assert_eq!(tasks.len(), 1); + assert!(matches!(tasks[0].op, UploadOp::Barrier(_))); + assert_eq!(queue.queued_operations.len(), 3); + + // Complete the barrier. The rest of the tasks schedule immediately. + queue.complete(tasks[0].task_id); + + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[4..]); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Deletes can be scheduled in parallel, even if they're for the same file. + #[test] + fn schedule_delete_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + // Enqueue a bunch of deletes, some with conflicting names. 
+ let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::Delete(Delete { + layers: vec![(layer0.layer_desc().layer_name(), layer0.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![ + (layer1.layer_desc().layer_name(), layer1.metadata()), + (layer2.layer_desc().layer_name(), layer2.metadata()), + ], + }), + UploadOp::Delete(Delete { + layers: vec![(layer2.layer_desc().layer_name(), layer2.metadata())], + }), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Since deletes don't conflict, they're all scheduled. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads are serialized. + #[test] + fn schedule_upload_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three versions of the same layer, with different file sizes. 
+ let layer0a = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 1); + let layer0b = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 2); + let layer0c = make_layer_with_size(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51", 3); + + let ops = [ + UploadOp::UploadLayer(layer0a.clone(), layer0a.metadata(), None), + UploadOp::UploadLayer(layer0b.clone(), layer0b.metadata(), None), + UploadOp::UploadLayer(layer0c.clone(), layer0c.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one version should be scheduled and uploaded at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Conflicting uploads and deletes are serialized. + #[test] + fn schedule_upload_delete_conflicts() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. 
+ let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Only one operation should be scheduled and completed at a time. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.schedule_ready().is_empty()); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Non-conflicting uploads and deletes can bypass the queue, avoiding the conflicting + /// delete/upload operations at the head of the queue. + #[test] + fn schedule_upload_delete_conflicts_bypass() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue two layer uploads, with a delete of both layers in between them. These should be + // scheduled one at a time, since deletes can't bypass uploads and vice versa. + // + // Also enqueue non-conflicting uploads and deletes at the end. These can bypass the queue + // and run immediately.
+ let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![ + (layer0.layer_desc().layer_name(), layer0.metadata()), + (layer1.layer_desc().layer_name(), layer1.metadata()), + ], + }), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations 0, 3, and 4 are scheduled immediately. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), [&ops[0], &ops[3], &ops[4]]); + assert_eq!(queue.queued_operations.len(), 2); + + Ok(()) + } + + /// Non-conflicting uploads are parallelized. + #[test] + fn schedule_upload_parallel() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three different layer uploads. 
+ let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // All uploads should be scheduled concurrently. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops); + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Index uploads are coalesced. + #[test] + fn schedule_index_coalesce() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + + // Enqueue three uploads of the current empty index. + let index = Box::new(queue.clean.0.clone()); + + let ops = [ + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The index uploads are coalesced into a single operation. + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &ops[2]); + assert_same_ops(&tasks[0].coalesced_ops, &ops[0..2]); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Chains of upload/index operations lead to parallel layer uploads and serial index uploads. + /// This is the common case with layer flushes. 
+ #[test] + fn schedule_index_upload_chain() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Enqueue three layer uploads, each followed by an index upload that references the new layer. + let index = Box::new(queue.clean.0.clone()); + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index0 = index_with(&index, &layer0); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index1 = index_with(&index0, &layer1); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index2 = index_with(&index1, &layer2); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index0.clone(), + }, + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index1.clone(), + }, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index2.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // The layer uploads should be scheduled immediately. The indexes must wait. + let upload_tasks = queue.schedule_ready(); + assert_same_ops( + upload_tasks.iter().map(|t| &t.op), + [&ops[0], &ops[2], &ops[4]], + ); + + // layer2 completes first. None of the indexes can upload yet. + queue.complete(upload_tasks[2].task_id); + assert!(queue.schedule_ready().is_empty()); + + // layer0 completes. index0 can upload. It completes.
+ queue.complete(upload_tasks[0].task_id); + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[1]); + queue.complete(index_tasks[0].task_id); + + // layer 1 completes. This unblocks index 1 and 2, which coalesce into + // a single upload for index 2. + queue.complete(upload_tasks[1].task_id); + + let index_tasks = queue.schedule_ready(); + assert_eq!(index_tasks.len(), 1); + assert_same_op(&index_tasks[0].op, &ops[5]); + assert_same_ops(&index_tasks[0].coalesced_ops, &ops[3..4]); + + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// A delete can't bypass an index upload if an index ahead of it still references it. + #[test] + fn schedule_index_delete_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let index_upload = index_with(&queue.clean.0, &layer); + + // Remove the layer reference in a new index, then delete the layer. + let index_deref = index_without(&index_upload, &layer); + + let ops = [ + // Initial upload, with a barrier to prevent index coalescing. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), + // Dereference the layer and delete it. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. 
+ for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// An upload with a reused layer name doesn't clobber the previous layer. Specifically, a + /// dereference/upload/reference cycle can't allow the upload to bypass the reference. + #[test] + fn schedule_index_upload_dereferenced() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_with_current_remote_index_part(&IndexPart::example(), 0)?; + let tli = make_timeline(); + + // Create a layer to upload. + let layer = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Upload the layer. Then dereference the layer, and upload/reference it again. + let index_upload = index_with(&queue.clean.0, &layer); + let index_deref = index_without(&index_upload, &layer); + let index_ref = index_with(&index_deref, &layer); + + let ops = [ + // Initial upload, with a barrier to prevent index coalescing. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_upload.clone(), + }, + UploadOp::Barrier(tokio::sync::watch::channel(()).0), + // Dereference the layer. + UploadOp::UploadMetadata { + uploaded: index_deref.clone(), + }, + // Replace and reference the layer. + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::UploadMetadata { + uploaded: index_ref.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Operations are serialized. + for op in ops { + let tasks = queue.schedule_ready(); + assert_eq!(tasks.len(), 1); + assert_same_op(&tasks[0].op, &op); + queue.complete(tasks[0].task_id); + } + assert!(queue.queued_operations.is_empty()); + + Ok(()) + } + + /// Nothing can bypass a shutdown, and it waits for inprogress tasks. 
It's never returned from + /// next_ready(), but is left at the head of the queue. + #[test] + fn schedule_shutdown() -> anyhow::Result<()> { + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 0)?; + let tli = make_timeline(); + + let index = Box::new(queue.clean.0.clone()); // empty, doesn't matter + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + // Enqueue non-conflicting upload, delete, and index before and after a shutdown. + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer1.layer_desc().layer_name(), layer1.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + UploadOp::Shutdown, + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer3.layer_desc().layer_name(), layer3.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: index.clone(), + }, + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule the initial operations ahead of the shutdown. + let tasks = queue.schedule_ready(); + + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..3]); + assert!(matches!( + queue.queued_operations.front(), + Some(&UploadOp::Shutdown) + )); + + // Complete the initial operations. The shutdown isn't triggered while they're pending. 
+ for task in tasks { + assert!(queue.schedule_ready().is_empty()); + queue.complete(task.task_id); + } + + // The shutdown is triggered the next time we try to pull an operation. It isn't returned, + // but is left in the queue. + assert!(!queue.shutdown_ready.is_closed()); + assert!(queue.next_ready().is_none()); + assert!(queue.shutdown_ready.is_closed()); + + Ok(()) + } + + /// Scheduling respects inprogress_limit. + #[test] + fn schedule_inprogress_limit() -> anyhow::Result<()> { + // Create a queue with inprogress_limit=2. + let mut queue = UploadQueue::Uninitialized; + let queue = queue.initialize_empty_remote(&TimelineMetadata::example(), 2)?; + let tli = make_timeline(); + + // Enqueue a bunch of uploads. + let layer0 = make_layer(&tli, "000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer1 = make_layer(&tli, "100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer2 = make_layer(&tli, "200000000000000000000000000000000000-300000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + let layer3 = make_layer(&tli, "300000000000000000000000000000000000-400000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"); + + let ops = [ + UploadOp::UploadLayer(layer0.clone(), layer0.metadata(), None), + UploadOp::UploadLayer(layer1.clone(), layer1.metadata(), None), + UploadOp::UploadLayer(layer2.clone(), layer2.metadata(), None), + UploadOp::UploadLayer(layer3.clone(), layer3.metadata(), None), + ]; + + queue.queued_operations.extend(ops.clone()); + + // Schedule all ready operations. Only 2 are scheduled. + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[0..2]); + assert!(queue.next_ready().is_none()); + + // When one completes, another is scheduled. 
+ queue.complete(tasks[0].task_id); + let tasks = queue.schedule_ready(); + assert_same_ops(tasks.iter().map(|t| &t.op), &ops[2..3]); + + Ok(()) + } + + /// Tests that can_bypass takes name, generation and shard index into account for all operations. + #[test] + fn can_bypass_path() -> anyhow::Result<()> { + let tli = make_timeline(); + + let name0 = &"000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + let name1 = &"100000000000000000000000000000000000-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51"; + + // Asserts that layers a and b either can or can't bypass each other, for all combinations + // of operations (except Delete and UploadMetadata which are special-cased). + #[track_caller] + fn assert_can_bypass(a: ResidentLayer, b: ResidentLayer, can_bypass: bool) { + let index = IndexPart::empty(TimelineMetadata::example()); + for (a, b) in make_ops(a).into_iter().zip(make_ops(b)) { + match (&a, &b) { + // Deletes can always bypass each other. + (UploadOp::Delete(_), UploadOp::Delete(_)) => assert!(a.can_bypass(&b, &index)), + // Indexes can never bypass each other. + (UploadOp::UploadMetadata { .. }, UploadOp::UploadMetadata { .. }) => { + assert!(!a.can_bypass(&b, &index)) + } + // For other operations, assert as requested. + (a, b) => assert_eq!(a.can_bypass(b, &index), can_bypass), + } + } + } + + fn make_ops(layer: ResidentLayer) -> Vec { + let mut index = IndexPart::empty(TimelineMetadata::example()); + index + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + vec![ + UploadOp::UploadLayer(layer.clone(), layer.metadata(), None), + UploadOp::Delete(Delete { + layers: vec![(layer.layer_desc().layer_name(), layer.metadata())], + }), + UploadOp::UploadMetadata { + uploaded: Box::new(index), + }, + ] + } + + // Makes a ResidentLayer. 
+ let layer = |name: &'static str, shard: Option, generation: u32| -> ResidentLayer { + let shard = shard + .map(|n| ShardIndex::new(ShardNumber(n), ShardCount(8))) + .unwrap_or(ShardIndex::unsharded()); + let metadata = LayerFileMetadata { + shard, + generation: Generation::Valid(generation), + file_size: 0, + }; + make_layer_with_metadata(&tli, name, metadata) + }; + + // Same name and metadata can't bypass. This goes both for unsharded and sharded, as well as + // 0 or >0 generation. + assert_can_bypass(layer(name0, None, 0), layer(name0, None, 0), false); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(0), 0), false); + assert_can_bypass(layer(name0, None, 1), layer(name0, None, 1), false); + + // Different names can bypass. + assert_can_bypass(layer(name0, None, 0), layer(name1, None, 0), true); + + // Different shards can bypass. Shard 0 is different from unsharded. + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, Some(1), 0), true); + assert_can_bypass(layer(name0, Some(0), 0), layer(name0, None, 0), true); + + // Different generations can bypass, both sharded and unsharded. 
+ assert_can_bypass(layer(name0, None, 0), layer(name0, None, 1), true); + assert_can_bypass(layer(name0, Some(1), 0), layer(name0, Some(1), 1), true); + + Ok(()) + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e5b23fed51..7253af8507 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -308,7 +308,7 @@ impl WalIngest { epoch -= 1; } - Ok((epoch as u64) << 32 | xid as u64) + Ok(((epoch as u64) << 32) | xid as u64) } async fn ingest_clear_vm_bits( diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile deleted file mode 100644 index 66436b5920..0000000000 --- a/pgxn/hnsw/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -EXTENSION = hnsw -EXTVERSION = 0.1.0 - -MODULE_big = hnsw -DATA = $(wildcard *--*.sql) -OBJS = hnsw.o hnswalg.o - -TESTS = $(wildcard test/sql/*.sql) -REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) -REGRESS_OPTS = --inputdir=test --load-extension=hnsw - -# For auto-vectorization: -# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html -PG_CFLAGS += -O3 -PG_CXXFLAGS += -O3 -std=c++11 -PG_LDFLAGS += -lstdc++ - -all: $(EXTENSION)--$(EXTVERSION).sql - -PG_CONFIG ?= pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) - -dist: - mkdir -p dist - git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md b/pgxn/hnsw/README.md deleted file mode 100644 index bc9c8d571c..0000000000 --- a/pgxn/hnsw/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors - -This ANN extension of Postgres is based -on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), -the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: - -[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest 
Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html), -
-Dmitry Baranchuk, Artem Babenko, Yury Malkov - -# Postgres extension - -HNSW index is hold in memory (built on demand) and it's maxial size is limited -by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type). -Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters -described in the article). - -# Example of usage: - -``` -create extension hnsw; -create table embeddings(id integer primary key, payload real[]); -create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); -select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; -``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql deleted file mode 100644 index ebf424326d..0000000000 --- a/pgxn/hnsw/hnsw--0.1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION hnsw" to load this file. 
\quit - --- functions - -CREATE FUNCTION l2_distance(real[], real[]) RETURNS real - AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; - --- operators - -CREATE OPERATOR <-> ( - LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, - COMMUTATOR = '<->' -); - --- access method - -CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler - AS 'MODULE_PATHNAME' LANGUAGE C; - -CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; - -COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; - --- opclasses - -CREATE OPERATOR CLASS knn_ops - DEFAULT FOR TYPE real[] USING hnsw AS - OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c deleted file mode 100644 index e624cb831f..0000000000 --- a/pgxn/hnsw/hnsw.c +++ /dev/null @@ -1,590 +0,0 @@ -#include "postgres.h" - -#include "access/amapi.h" -#include "access/generic_xlog.h" -#include "access/relation.h" -#include "access/reloptions.h" -#include "access/tableam.h" -#include "catalog/index.h" -#include "commands/vacuum.h" -#include "nodes/execnodes.h" -#include "storage/bufmgr.h" -#include "utils/guc.h" -#include "utils/selfuncs.h" - -#include -#include - -#include "hnsw.h" - -PG_MODULE_MAGIC; - -typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) 
*/ - int dims; - int maxelements; - int efConstruction; - int efSearch; - int M; -} HnswOptions; - -static relopt_kind hnsw_relopt_kind; - -typedef struct { - HierarchicalNSW* hnsw; - size_t curr; - size_t n_results; - ItemPointer results; -} HnswScanOpaqueData; - -typedef HnswScanOpaqueData* HnswScanOpaque; - -typedef struct { - Oid relid; - uint32 status; - HierarchicalNSW* hnsw; -} HnswHashEntry; - - -#define SH_PREFIX hnsw_index -#define SH_ELEMENT_TYPE HnswHashEntry -#define SH_KEY_TYPE Oid -#define SH_KEY relid -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->relid) -#define SH_HASH_KEY(tb, key) (key) -#define SH_EQUAL(tb, a, b) ((a) == (b)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -#define INDEX_HASH_SIZE 11 - -#define DEFAULT_EF_SEARCH 64 - -PGDLLEXPORT void _PG_init(void); - -static hnsw_index_hash *hnsw_indexes; - -/* - * Initialize index options and variables - */ -void -_PG_init(void) -{ - hnsw_relopt_kind = add_reloption_kind(); - add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", - 0, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", - 100, 0, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", - 16, 1, INT_MAX, AccessExclusiveLock); - add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", - 64, 1, INT_MAX, AccessExclusiveLock); - hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); -} - - -static void -hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, - bool *isnull, bool tupleIsAlive, void *state) -{ - HierarchicalNSW* hnsw = (HierarchicalNSW*) state; - ArrayType* array; - int n_items; - label_t label = 
0; - - /* Skip nulls */ - if (isnull[0]) - return; - - array = DatumGetArrayTypeP(values[0]); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - - memcpy(&label, tid, sizeof(*tid)); - hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); -} - -static void -hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) -{ - IndexInfo* indexInfo = BuildIndexInfo(indexRel); - Assert(indexInfo->ii_NumIndexAttrs == 1); - table_index_build_scan(heapRel, indexRel, indexInfo, - true, true, hnsw_build_callback, (void *) hnsw, NULL); -} - -#ifdef __APPLE__ - -#include -#include - -static void -hnsw_check_available_memory(Size requested) -{ - size_t total; - if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#else - -#include - -static void -hnsw_check_available_memory(Size requested) -{ - struct sysinfo si; - Size total; - if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %m"); - - total = si.totalram*si.mem_unit; - if ((Size)NBuffers*BLCKSZ + requested >= total) - elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available", - requested, total - (Size)NBuffers*BLCKSZ); -} - -#endif - -static HierarchicalNSW* -hnsw_get_index(Relation indexRel, Relation heapRel) -{ - HierarchicalNSW* hnsw; - Oid indexoid = RelationGetRelid(indexRel); - HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); - if (entry == NULL) - { - size_t dims, maxelements; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t size_data_per_element; - size_t data_size; - dsm_handle handle = indexoid << 1; /* make it even */ - void* impl_private 
= NULL; - void* mapped_address = NULL; - Size mapped_size = 0; - Size shmem_size; - bool exists = true; - bool found; - HnswOptions *opts = (HnswOptions *) indexRel->rd_options; - if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { - elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); - } - dims = opts->dims; - maxelements = opts->maxelements; - M = opts->M; - maxM = M * 2; - data_size = dims * sizeof(coord_t); - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; - - hnsw_check_available_memory(shmem_size); - - /* first try to attach to existed index */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* index doesn't exists: try to create it */ - if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, - &mapped_address, &mapped_size, DEBUG1)) - { - /* We can do it under shared lock, so some other backend may - * try to initialize index. 
If create is failed because index already - * created by somebody else, then try to attach to it once again - */ - if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, - &mapped_address, &mapped_size, ERROR)) - { - return NULL; - } - } - else - { - exists = false; - } - } - Assert(mapped_size == shmem_size); - hnsw = (HierarchicalNSW*)mapped_address; - - if (!exists) - { - hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); - hnsw_populate(hnsw, indexRel, heapRel); - } - entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); - Assert(!found); - entry->hnsw = hnsw; - } - else - { - hnsw = entry->hnsw; - } - return hnsw; -} - -/* - * Start or restart an index scan - */ -static IndexScanDesc -hnsw_beginscan(Relation index, int nkeys, int norderbys) -{ - IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); - HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); - Relation heap = relation_open(index->rd_index->indrelid, NoLock); - so->hnsw = hnsw_get_index(index, heap); - relation_close(heap, NoLock); - so->curr = 0; - so->n_results = 0; - so->results = NULL; - scan->opaque = so; - return scan; -} - -/* - * Start or restart an index scan - */ -static void -hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - { - pfree(so->results); - so->results = NULL; - } - so->curr = 0; - if (orderbys && scan->numberOfOrderBys > 0) - memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); -} - -/* - * Fetch the next tuple in the given scan - */ -static bool -hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - - /* - * Index can be used to scan backward, but Postgres doesn't support - * backward scan on operators - */ - Assert(ScanDirectionIsForward(dir)); - - if (so->curr == 0) - { - Datum value; - ArrayType* array; - int 
n_items; - size_t n_results; - label_t* results; - HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; - size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH; - - /* Safety check */ - if (scan->orderByData == NULL) - elog(ERROR, "cannot scan HNSW index without order"); - - /* No items will match if null */ - if (scan->orderByData->sk_flags & SK_ISNULL) - return false; - - value = scan->orderByData->sk_argument; - array = DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(so->hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(so->hnsw)); - } - - if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) - elog(ERROR, "HNSW index search failed"); - so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); - so->n_results = n_results; - for (size_t i = 0; i < n_results; i++) - { - memcpy(&so->results[i], &results[i], sizeof(so->results[i])); - } - free(results); - } - if (so->curr >= so->n_results) - { - return false; - } - else - { - scan->xs_heaptid = so->results[so->curr++]; - scan->xs_recheckorderby = false; - return true; - } -} - -/* - * End a scan and release resources - */ -static void -hnsw_endscan(IndexScanDesc scan) -{ - HnswScanOpaque so = (HnswScanOpaque) scan->opaque; - if (so->results) - pfree(so->results); - pfree(so); - scan->opaque = NULL; -} - - -/* - * Estimate the cost of an index scan - */ -static void -hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, - Cost *indexStartupCost, Cost *indexTotalCost, - Selectivity *indexSelectivity, double *indexCorrelation - ,double *indexPages -) -{ - GenericCosts costs; - - /* Never use index without order */ - if (path->indexorderbys == NULL) - { - *indexStartupCost = DBL_MAX; - *indexTotalCost = DBL_MAX; - *indexSelectivity = 0; - *indexCorrelation = 0; - *indexPages = 0; - return; - } - - 
MemSet(&costs, 0, sizeof(costs)); - - genericcostestimate(root, path, loop_count, &costs); - - /* Startup cost and total cost are same */ - *indexStartupCost = costs.indexTotalCost; - *indexTotalCost = costs.indexTotalCost; - *indexSelectivity = costs.indexSelectivity; - *indexCorrelation = costs.indexCorrelation; - *indexPages = costs.numIndexPages; -} - -/* - * Parse and validate the reloptions - */ -static bytea * -hnsw_options(Datum reloptions, bool validate) -{ - static const relopt_parse_elt tab[] = { - {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, - {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, - {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, - {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, - {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} - }; - - return (bytea *) build_reloptions(reloptions, validate, - hnsw_relopt_kind, - sizeof(HnswOptions), - tab, lengthof(tab)); -} - -/* - * Validate catalog entries for the specified operator class - */ -static bool -hnsw_validate(Oid opclassoid) -{ - return true; -} - -/* - * Build the index for a logged table - */ -static IndexBuildResult * -hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); - result->heap_tuples = result->index_tuples = hnsw_count(hnsw); - - return result; -} - -/* - * Insert a tuple into the index - */ -static bool -hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, - Relation heap, IndexUniqueCheck checkUnique, - bool indexUnchanged, - IndexInfo *indexInfo) -{ - HierarchicalNSW* hnsw = hnsw_get_index(index, heap); - Datum value; - ArrayType* array; - int n_items; - label_t label = 0; - - /* Skip nulls */ - if (isnull[0]) - return false; - - /* Detoast value */ - value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); - array = 
DatumGetArrayTypeP(value); - n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); - if (n_items != hnsw_dimensions(hnsw)) - { - elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", - n_items, hnsw_dimensions(hnsw)); - } - memcpy(&label, heap_tid, sizeof(*heap_tid)); - if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) - elog(ERROR, "HNSW index insert failed"); - return true; -} - -/* - * Build the index for an unlogged table - */ -static void -hnsw_buildempty(Relation index) -{ - /* index will be constructed on dema nd when accessed */ -} - -/* - * Clean up after a VACUUM operation - */ -static IndexBulkDeleteResult * -hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) -{ - Relation rel = info->index; - - if (stats == NULL) - return NULL; - - stats->num_pages = RelationGetNumberOfBlocks(rel); - - return stats; -} - -/* - * Bulk delete tuples from the index - */ -static IndexBulkDeleteResult * -hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, - IndexBulkDeleteCallback callback, void *callback_state) -{ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - return stats; -} - -/* - * Define index handler - * - * See https://www.postgresql.org/docs/current/index-api.html - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); -Datum -hnsw_handler(PG_FUNCTION_ARGS) -{ - IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); - - amroutine->amstrategies = 0; - amroutine->amsupport = 0; - amroutine->amoptsprocnum = 0; - amroutine->amcanorder = false; - amroutine->amcanorderbyop = true; - amroutine->amcanbackward = false; /* can change direction mid-scan */ - amroutine->amcanunique = false; - amroutine->amcanmulticol = false; - amroutine->amoptionalkey = true; - amroutine->amsearcharray = false; - amroutine->amsearchnulls = false; - amroutine->amstorage = false; - amroutine->amclusterable = false; - amroutine->ampredlocks = false; - amroutine->amcanparallel 
= false; - amroutine->amcaninclude = false; - amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ - amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; - amroutine->amkeytype = InvalidOid; - - /* Interface functions */ - amroutine->ambuild = hnsw_build; - amroutine->ambuildempty = hnsw_buildempty; - amroutine->aminsert = hnsw_insert; - amroutine->ambulkdelete = hnsw_bulkdelete; - amroutine->amvacuumcleanup = hnsw_vacuumcleanup; - amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ - amroutine->amcostestimate = hnsw_costestimate; - amroutine->amoptions = hnsw_options; - amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ - amroutine->ambuildphasename = NULL; - amroutine->amvalidate = hnsw_validate; - amroutine->amadjustmembers = NULL; - amroutine->ambeginscan = hnsw_beginscan; - amroutine->amrescan = hnsw_rescan; - amroutine->amgettuple = hnsw_gettuple; - amroutine->amgetbitmap = NULL; - amroutine->amendscan = hnsw_endscan; - amroutine->ammarkpos = NULL; - amroutine->amrestrpos = NULL; - - /* Interface functions to support parallel index scans */ - amroutine->amestimateparallelscan = NULL; - amroutine->aminitparallelscan = NULL; - amroutine->amparallelrescan = NULL; - - PG_RETURN_POINTER(amroutine); -} - -/* - * Get the L2 distance between vectors - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); -Datum -l2_distance(PG_FUNCTION_ARGS) -{ - ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); - ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); - int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); - int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); - dist_t distance = 0.0; - dist_t diff; - coord_t *ax = (coord_t*)ARR_DATA_PTR(a); - coord_t *bx = (coord_t*)ARR_DATA_PTR(b); - - if (a_dim != b_dim) - { - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("different array dimensions %d and %d", a_dim, b_dim))); - } - - for (int i = 0; i < a_dim; i++) - { - diff = ax[i] - bx[i]; - distance += diff * diff; - } 
- - PG_RETURN_FLOAT4((dist_t)sqrt(distance)); -} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control deleted file mode 100644 index fbfa1a5b47..0000000000 --- a/pgxn/hnsw/hnsw.control +++ /dev/null @@ -1,4 +0,0 @@ -comment = '** Deprecated ** Please use pg_embedding instead' -default_version = '0.1.0' -module_pathname = '$libdir/hnsw' -relocatable = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h deleted file mode 100644 index d4065ab8fe..0000000000 --- a/pgxn/hnsw/hnsw.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -typedef float coord_t; -typedef float dist_t; -typedef uint32_t idx_t; -typedef uint64_t label_t; - -typedef struct HierarchicalNSW HierarchicalNSW; - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); -void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); -int hnsw_dimensions(HierarchicalNSW* hnsw); -size_t hnsw_count(HierarchicalNSW* hnsw); -size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp deleted file mode 100644 index f6de3b8314..0000000000 --- a/pgxn/hnsw/hnswalg.cpp +++ /dev/null @@ -1,379 +0,0 @@ -#include "hnswalg.h" - -#if defined(__GNUC__) -#define PORTABLE_ALIGN32 __attribute__((aligned(32))) -#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) -#else -#define PORTABLE_ALIGN32 __declspec(align(32)) -#define PREFETCH(addr,hint) -#endif - -HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) -{ - dim = dim_; - data_size = dim * sizeof(coord_t); - - efConstruction = efConstruction_; - - maxelements = maxelements_; - M = M_; - maxM = maxM_; - size_links_level0 = (maxM + 1) * sizeof(idx_t); - size_data_per_element = size_links_level0 + data_size + sizeof(label_t); - offset_data = size_links_level0; - offset_label 
= offset_data + data_size; - - enterpoint_node = 0; - cur_element_count = 0; -#ifdef __x86_64__ - use_avx2 = __builtin_cpu_supports("avx2"); -#endif -} - -std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) -{ - std::vector visited; - visited.resize((cur_element_count + 31) >> 5); - - std::priority_queue> topResults; - std::priority_queue> candidateSet; - - dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); - - topResults.emplace(dist, enterpoint_node); - candidateSet.emplace(-dist, enterpoint_node); - visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); - dist_t lowerBound = dist; - - while (!candidateSet.empty()) - { - std::pair curr_el_pair = candidateSet.top(); - if (-curr_el_pair.first > lowerBound) - break; - - candidateSet.pop(); - idx_t curNodeNum = curr_el_pair.second; - - idx_t* data = get_linklist0(curNodeNum); - size_t size = *data++; - - PREFETCH(getDataByInternalId(*data), 0); - - for (size_t j = 0; j < size; ++j) { - size_t tnum = *(data + j); - - PREFETCH(getDataByInternalId(*(data + j + 1)), 0); - - if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { - visited[tnum >> 5] |= 1 << (tnum & 31); - - dist = fstdistfunc(point, getDataByInternalId(tnum)); - - if (topResults.top().first > dist || topResults.size() < ef) { - candidateSet.emplace(-dist, tnum); - - PREFETCH(get_linklist0(candidateSet.top().second), 0); - topResults.emplace(dist, tnum); - - if (topResults.size() > ef) - topResults.pop(); - - lowerBound = topResults.top().first; - } - } - } - } - return topResults; -} - - -void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) -{ - if (topResults.size() < NN) - return; - - std::priority_queue> resultSet; - std::vector> returnlist; - - while (topResults.size() > 0) { - resultSet.emplace(-topResults.top().first, topResults.top().second); - topResults.pop(); - } - - while (resultSet.size()) { - if (returnlist.size() >= NN) - break; - std::pair curen = 
resultSet.top(); - dist_t dist_to_query = -curen.first; - resultSet.pop(); - bool good = true; - for (std::pair curen2 : returnlist) { - dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), - getDataByInternalId(curen.second)); - if (curdist < dist_to_query) { - good = false; - break; - } - } - if (good) returnlist.push_back(curen); - } - for (std::pair elem : returnlist) - topResults.emplace(-elem.first, elem.second); -} - -void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, - std::priority_queue> topResults) -{ - getNeighborsByHeuristic(topResults, M); - - std::vector res; - res.reserve(M); - while (topResults.size() > 0) { - res.push_back(topResults.top().second); - topResults.pop(); - } - { - idx_t* data = get_linklist0(cur_c); - if (*data) - throw std::runtime_error("Should be blank"); - - *data++ = res.size(); - - for (size_t idx = 0; idx < res.size(); idx++) { - if (data[idx]) - throw std::runtime_error("Should be blank"); - data[idx] = res[idx]; - } - } - for (size_t idx = 0; idx < res.size(); idx++) { - if (res[idx] == cur_c) - throw std::runtime_error("Connection to the same element"); - - size_t resMmax = maxM; - idx_t *ll_other = get_linklist0(res[idx]); - idx_t sz_link_list_other = *ll_other; - - if (sz_link_list_other > resMmax || sz_link_list_other < 0) - throw std::runtime_error("Bad sz_link_list_other"); - - if (sz_link_list_other < resMmax) { - idx_t *data = ll_other + 1; - data[sz_link_list_other] = cur_c; - *ll_other = sz_link_list_other + 1; - } else { - // finding the "weakest" element to replace it with the new one - idx_t *data = ll_other + 1; - dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); - // Heuristic: - std::priority_queue> candidates; - candidates.emplace(d_max, cur_c); - - for (size_t j = 0; j < sz_link_list_other; j++) - candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); - - 
getNeighborsByHeuristic(candidates, resMmax); - - size_t indx = 0; - while (!candidates.empty()) { - data[indx] = candidates.top().second; - candidates.pop(); - indx++; - } - *ll_other = indx; - } - } -} - -void HierarchicalNSW::addPoint(const coord_t *point, label_t label) -{ - if (cur_element_count >= maxelements) { - throw std::runtime_error("The number of elements exceeds the specified limit"); - } - idx_t cur_c = cur_element_count++; - memset((char *) get_linklist0(cur_c), 0, size_data_per_element); - memcpy(getDataByInternalId(cur_c), point, data_size); - memcpy(getExternalLabel(cur_c), &label, sizeof label); - - // Do nothing for the first element - if (cur_c != 0) { - std::priority_queue > topResults = searchBaseLayer(point, efConstruction); - mutuallyConnectNewElement(point, cur_c, topResults); - } -}; - -std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) -{ - std::priority_queue> topResults; - auto topCandidates = searchBaseLayer(query, k); - while (topCandidates.size() > k) { - topCandidates.pop(); - } - while (!topCandidates.empty()) { - std::pair rez = topCandidates.top(); - label_t label; - memcpy(&label, getExternalLabel(rez.second), sizeof(label)); - topResults.push(std::pair(rez.first, label)); - topCandidates.pop(); - } - - return topResults; -}; - -dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) -{ - dist_t distance = 0.0; - - for (size_t i = 0; i < n; i++) - { - dist_t diff = x[i] - y[i]; - distance += diff * diff; - } - return distance; - -} - -#ifdef __x86_64__ -#include - -__attribute__((target("avx2"))) -dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m256) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 
8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - return (res); -} - -dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) -{ - const size_t TmpResSz = sizeof(__m128) / sizeof(float); - float PORTABLE_ALIGN32 TmpRes[TmpResSz]; - size_t qty16 = n / 16; - const float *pEnd1 = x + (qty16 * 16); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(x); - x += 4; - v2 = _mm_loadu_ps(y); - y += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - return res; -} -#endif - -dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) -{ -#ifndef __x86_64__ - return fstdistfunc_scalar(x, y, dim); -#else - if(use_avx2) - return fstdistfunc_avx2(x, y, dim); - - return fstdistfunc_sse(x, y, dim); -#endif -} - -bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) -{ - try - { - auto result = hnsw->searchKnn(point, efSearch); - size_t nResults = result.size(); - *results = (label_t*)malloc(nResults*sizeof(label_t)); - for (size_t i = nResults; 
i-- != 0;) - { - (*results)[i] = result.top().second; - result.pop(); - } - *n_results = nResults; - return true; - } - catch (std::exception& x) - { - return false; - } -} - -bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) -{ - try - { - hnsw->addPoint(point, label); - return true; - } - catch (std::exception& x) - { - fprintf(stderr, "Catch %s\n", x.what()); - return false; - } -} - -void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction) -{ - new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); -} - - -int hnsw_dimensions(HierarchicalNSW* hnsw) -{ - return (int)hnsw->dim; -} - -size_t hnsw_count(HierarchicalNSW* hnsw) -{ - return hnsw->cur_element_count; -} - -size_t hnsw_sizeof(void) -{ - return sizeof(HierarchicalNSW); -} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h deleted file mode 100644 index f38aeac362..0000000000 --- a/pgxn/hnsw/hnswalg.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern "C" { -#include "hnsw.h" -} - -struct HierarchicalNSW -{ - size_t maxelements; - size_t cur_element_count; - - idx_t enterpoint_node; - - size_t dim; - size_t data_size; - size_t offset_data; - size_t offset_label; - size_t size_data_per_element; - size_t M; - size_t maxM; - size_t size_links_level0; - size_t efConstruction; - -#ifdef __x86_64__ - bool use_avx2; -#endif - - char data_level0_memory[0]; // varying size - - public: - HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); - ~HierarchicalNSW(); - - - inline coord_t *getDataByInternalId(idx_t internal_id) const { - return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; - } - - inline idx_t *get_linklist0(idx_t internal_id) const { - return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; 
- } - - inline label_t *getExternalLabel(idx_t internal_id) const { - return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; - } - - std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); - - void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); - - void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); - - void addPoint(const coord_t *point, label_t label); - - std::priority_queue> searchKnn(const coord_t *query_data, size_t k); - - dist_t fstdistfunc(const coord_t *x, const coord_t *y); -}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out deleted file mode 100644 index a1cee4525e..0000000000 --- a/pgxn/hnsw/test/expected/knn.out +++ /dev/null @@ -1,28 +0,0 @@ -SET enable_seqscan = off; -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); -INSERT INTO t (val) VALUES (array[1,2,4]); -explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; - QUERY PLAN --------------------------------------------------------------------- - Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) - Order By: (val <-> '{3,3,3}'::real[]) -(2 rows) - -SELECT * FROM t ORDER BY val <-> array[3,3,3]; - val ---------- - {1,2,3} - {1,2,4} - {1,1,1} - {0,0,0} -(4 rows) - -SELECT COUNT(*) FROM t; - count -------- - 5 -(1 row) - -DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql deleted file mode 100644 index 0635bda4a2..0000000000 --- a/pgxn/hnsw/test/sql/knn.sql +++ /dev/null @@ -1,13 +0,0 @@ -SET enable_seqscan = off; - -CREATE TABLE t (val real[]); -INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); -CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); - -INSERT INTO t (val) VALUES (array[1,2,4]); - -explain SELECT * FROM t ORDER BY val <-> 
array[3,3,3]; -SELECT * FROM t ORDER BY val <-> array[3,3,3]; -SELECT COUNT(*) FROM t; - -DROP TABLE t; diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2f63ee3acc..f362a45035 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -106,6 +106,7 @@ jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] } signature = "2" ecdsa = "0.16" p256 = { version = "0.13", features = ["jwk"] } +ed25519-dalek = { version = "2", default-features = false, features = ["rand_core"] } rsa = "0.9" workspace_hack.workspace = true diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 5f65b17374..d7ffff0483 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -187,10 +187,6 @@ pub async fn worker( let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx)); let rx = rx.map(RequestData::from); - let storage = GenericRemoteStorage::from_config(&remote_storage_config) - .await - .context("remote storage init")?; - let properties = WriterProperties::builder() .set_data_page_size_limit(config.parquet_upload_page_size) .set_compression(config.parquet_upload_compression); @@ -224,18 +220,18 @@ pub async fn worker( let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); let rx_disconnect = rx_disconnect.map(RequestData::from); - let storage_disconnect = - GenericRemoteStorage::from_config(&disconnect_events_storage_config) - .await - .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(storage, rx, parquet_config), - worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + worker_inner(remote_storage_config, rx, parquet_config), + worker_inner( + disconnect_events_storage_config, + rx_disconnect, + parquet_config_disconnect + ) ) .map(|_| ()) } else { - worker_inner(storage, rx, parquet_config).await + worker_inner(remote_storage_config, rx, parquet_config).await } } @@ -251,18 +247,32 
@@ struct ParquetConfig { test_remote_failures: u64, } +impl ParquetConfig { + async fn storage( + &self, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + let storage = GenericRemoteStorage::from_config(storage_config) + .await + .context("remote storage init")?; + + #[cfg(any(test, feature = "testing"))] + if self.test_remote_failures > 0 { + return Ok(GenericRemoteStorage::unreliable_wrapper( + storage, + self.test_remote_failures, + )); + } + + Ok(storage) + } +} + async fn worker_inner( - storage: GenericRemoteStorage, + storage_config: RemoteStorageConfig, rx: impl Stream, config: ParquetConfig, ) -> anyhow::Result<()> { - #[cfg(any(test, feature = "testing"))] - let storage = if config.test_remote_failures > 0 { - GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures) - } else { - storage - }; - let mut rx = std::pin::pin!(rx); let mut rows = Vec::with_capacity(config.rows_per_group); @@ -285,7 +295,7 @@ async fn worker_inner( } if len > config.file_size || force { last_upload = time::Instant::now(); - let file = upload_parquet(w, len, &storage).await?; + let file = upload_parquet(w, len, &storage_config, &config).await?; w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; len = 0; } @@ -298,7 +308,7 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _rtchk: Writer = upload_parquet(w, len, &storage).await?; + let _rtchk: Writer = upload_parquet(w, len, &storage_config, &config).await?; } Ok(()) @@ -340,7 +350,8 @@ where async fn upload_parquet( mut w: SerializedFileWriter>, len: i64, - storage: &GenericRemoteStorage, + storage_config: &RemoteStorageConfig, + config: &ParquetConfig, ) -> anyhow::Result> { let len_uncompressed = w .flushed_row_groups() @@ -377,6 +388,15 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); + // A bug in azure-sdk means that the identity-token-file that expires after + // 1 hour is not refreshed. 
This identity-token is used to fetch the actual azure storage + // tokens that last for 24 hours. After this 24 hour period, azure-sdk tries to refresh + // the storage token, but the identity token has now expired. + // + // + // To work around this, we recreate the storage every time. + let storage = config.storage(storage_config).await?; + let year = now.year(); let month = now.month(); let day = now.day(); @@ -431,8 +451,8 @@ mod tests { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use remote_storage::{ - GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, - DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::sync::mpsc; use tokio::time; @@ -559,12 +579,11 @@ mod tests { timeout: std::time::Duration::from_secs(120), small_timeout: std::time::Duration::from_secs(30), }; - let storage = GenericRemoteStorage::from_config(&remote_storage_config) + + worker_inner(remote_storage_config, rx, config) .await .unwrap(); - worker_inner(storage, rx, config).await.unwrap(); - let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() .filter_map(|entry| entry.ok()) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index b398c3ddd0..6d5fb13681 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -3,9 +3,9 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; +use ed25519_dalek::SigningKey; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::ecdsa::SigningKey; -use p256::elliptic_curve::JwkEcKey; +use jose_jwk::jose_b64; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; @@ -354,9 +354,15 @@ impl PoolingBackend { } } -fn create_random_jwk() -> (SigningKey, JwkEcKey) { - let key = SigningKey::random(&mut OsRng); - let jwk = 
p256::PublicKey::from(key.verifying_key()).to_jwk(); +fn create_random_jwk() -> (SigningKey, jose_jwk::Key) { + let key = SigningKey::generate(&mut OsRng); + + let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + crv: jose_jwk::OkpCurves::Ed25519, + x: jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + d: None, + }); + (key, jwk) } diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c51a2bc9ba..fe33f0ff65 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -16,17 +16,16 @@ use std::sync::Arc; use std::task::{ready, Poll}; use std::time::Duration; +use ed25519_dalek::{Signature, Signer, SigningKey}; use futures::future::poll_fn; use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; -use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use postgres_client::tls::NoTlsStream; use postgres_client::types::ToSql; use postgres_client::AsyncMessage; use serde_json::value::RawValue; -use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -42,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.1.2"; +pub(crate) const EXT_VERSION: &str = "0.2.0"; pub(crate) const EXT_SCHEMA: &str = "auth"; #[derive(Clone)] @@ -339,8 +338,8 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { let cap = jwt.capacity(); // we only need an empty header with the alg specified. 
- // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" - jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + // base64url(r#"{"alg":"EdDSA"}"#) == "eyJhbGciOiJFZERTQSJ9" + jwt.push_str("eyJhbGciOiJFZERTQSJ9."); // encode the jwt payload in-place base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); @@ -366,14 +365,14 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { - use p256::ecdsa::SigningKey; + use ed25519_dalek::SigningKey; use typed_json::json; use super::resign_jwt; #[test] fn jwt_token_snapshot() { - let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let key = SigningKey::from_bytes(&[1; 32]); let data = json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); @@ -381,12 +380,17 @@ mod tests { // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. // In the public-key box, paste the following jwk public key - // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + // `{"kty":"OKP","crv":"Ed25519","x":"iojj3XQJ8ZX9UtstPLpdcspnCb8dlBIb83SIAbQPb1w"}` + // Note - jwt.io doesn't support EdDSA :( + // https://github.com/jsonwebtoken/jsonwebtoken.github.io/issues/509 - // let pub_key = p256::ecdsa::VerifyingKey::from(&key); - // let pub_key = p256::PublicKey::from(pub_key); - // println!("{}", pub_key.to_jwk_string()); + // let jwk = jose_jwk::Key::Okp(jose_jwk::Okp { + // crv: jose_jwk::OkpCurves::Ed25519, + // x: jose_jwk::jose_b64::serde::Bytes::from(key.verifying_key().to_bytes().to_vec()), + // d: None, + // }); + // println!("{}", serde_json::to_string(&jwk).unwrap()); - assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + assert_eq!(jwt, 
"eyJhbGciOiJFZERTQSJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.Cvyc2By33KI0f0obystwdy8PN111L3Sc9_Mr2CU3XshtSqSdxuRxNEZGbb_RvyJf2IzheC_s7aBZ-jLeQ9N0Bg"); } } diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 996c4d9b8c..19c6662e74 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -21,14 +21,13 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; -/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. -/// This mirrors the configuration in bin/safekeeper.rs. +/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; // Register benchmarks with Criterion. criterion_group!( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 13f6e34575..bc7af02185 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,10 +51,12 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; -/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21). +/// This adds roughly 3% overhead for allocations on average, which is acceptable considering +/// performance-sensitive code will avoid allocations as far as possible anyway. 
#[allow(non_upper_case_globals)] #[export_name = "malloc_conf"] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 5f3319512d..caaa22d0a5 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -55,4 +55,4 @@ r2d2 = { version = "0.8.10" } utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } -workspace_hack = { version = "0.1", path = "../workspace_hack" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } \ No newline at end of file diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql new file mode 100644 index 0000000000..c2624f858b --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/down.sql @@ -0,0 +1,4 @@ +-- this sadly isn't a "true" revert of the migration, as the column is now at the end of the table. +-- But preserving order is not a trivial operation. 
+-- https://wiki.postgresql.org/wiki/Alter_column_position +ALTER TABLE safekeepers ADD active BOOLEAN NOT NULL DEFAULT false; diff --git a/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql new file mode 100644 index 0000000000..d76f044eda --- /dev/null +++ b/storage_controller/migrations/2025-01-09-160454_safekeepers_remove_active/up.sql @@ -0,0 +1 @@ +ALTER TABLE safekeepers DROP active; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 69db48f8d1..3884a6df46 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -124,7 +124,10 @@ impl ComputeHookTenant { if let Some(shard_idx) = shard_idx { sharded.shards.remove(shard_idx); } else { - tracing::warn!("Shard not found while handling detach") + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. + tracing::info!("Shard not found while handling detach") } } ComputeHookTenant::Unsharded(_) => { @@ -761,7 +764,10 @@ impl ComputeHook { let mut state_locked = self.state.lock().unwrap(); match state_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { - tracing::warn!("Compute hook tenant not found for detach"); + // This is a valid but niche case, where the tenant was previously attached + // as a Secondary location and then detached, so has no previously notified + // state. 
+ tracing::info!("Compute hook tenant not found for detach"); } Entry::Occupied(mut e) => { let sharded = e.get().is_sharded(); diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index 47f4276ff2..8b7be88078 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -112,7 +112,7 @@ impl TenantShardDrain { } } - match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { + match tenant_shard.preferred_secondary(scheduler) { Some(node) => Some(node), None => { tracing::warn!( diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 5385e4ee0b..c8df4ffe28 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -690,7 +690,8 @@ async fn handle_node_list(req: Request) -> Result, ApiError }; let state = get_state(&req); - let nodes = state.service.node_list().await?; + let mut nodes = state.service.node_list().await?; + nodes.sort_by_key(|n| n.get_id()); let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); json_response(StatusCode::OK, api_nodes) @@ -1005,6 +1006,29 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_migrate_secondary( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate_secondary(tenant_shard_id, migrate_req) + .await?, + ) +} + async fn handle_tenant_shard_cancel_reconcile( service: Arc, req: Request, @@ -1855,6 +1879,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/migrate_secondary", + |r| { + 
tenant_service_handler( + r, + handle_tenant_shard_migrate_secondary, + RequestName("control_v1_tenant_migrate_secondary"), + ) + }, + ) .put( "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", |r| { diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 6d5885eba6..4164e3dc2b 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -53,6 +53,16 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards are not scheduled into their preferred AZ pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + /// How many shard locations (secondary or attached) on each node + pub(crate) storage_controller_node_shards: measured::GaugeVec, + + /// How many _attached_ shard locations on each node + pub(crate) storage_controller_node_attached_shards: measured::GaugeVec, + + /// How many _home_ shard locations on each node (i.e. the node's AZ matches the shard's + /// preferred AZ) + pub(crate) storage_controller_node_home_shards: measured::GaugeVec, + /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, @@ -132,6 +142,15 @@ impl Default for StorageControllerMetrics { } } +#[derive(measured::LabelGroup, Clone)] +#[label(set = NodeLabelGroupSet)] +pub(crate) struct NodeLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) az: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) node_id: &'a str, +} + #[derive(measured::LabelGroup)] #[label(set = ReconcileCompleteLabelGroupSet)] pub(crate) struct ReconcileCompleteLabelGroup { diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4cc9b0070d..f5c2d329e0 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -299,6 +299,7 @@ impl Node { id: self.id, availability: self.availability.clone().into(), scheduling: self.scheduling, 
+ availability_zone_id: self.availability_zone_id.0.clone(), listen_http_addr: self.listen_http_addr.clone(), listen_http_port: self.listen_http_port, listen_pg_addr: self.listen_pg_addr.clone(), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index cebf3e9594..eb0bfc879e 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -708,10 +708,11 @@ impl Persistence { Ok(()) } + /// Note that passing None for a shard clears the preferred AZ (rather than leaving it unmodified) pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, - ) -> DatabaseResult> { + preferred_azs: Vec<(TenantShardId, Option)>, + ) -> DatabaseResult)>> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { @@ -722,7 +723,7 @@ impl Persistence { .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az.0.clone())) + .set(preferred_az_id.eq(preferred_az.as_ref().map(|az| az.0.clone()))) .execute(conn)?; if updated == 1 { @@ -1258,7 +1259,6 @@ pub(crate) struct SafekeeperPersistence { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, pub(crate) scheduling_policy: String, @@ -1270,7 +1270,6 @@ impl SafekeeperPersistence { SkSchedulingPolicy::from_str(&self.scheduling_policy).map_err(|e| { DatabaseError::Logical(format!("can't construct SkSchedulingPolicy: {e:?}")) })?; - // omit the `active` flag on purpose: it is deprecated. 
Ok(SafekeeperDescribeResponse { id: NodeId(self.id as u64), region_id: self.region_id.clone(), @@ -1295,7 +1294,8 @@ pub(crate) struct SafekeeperUpsert { pub(crate) version: i64, pub(crate) host: String, pub(crate) port: i32, - pub(crate) active: bool, + /// The active flag will not be stored in the database and will be ignored. + pub(crate) active: Option, pub(crate) http_port: i32, pub(crate) availability_zone_id: String, } @@ -1311,7 +1311,6 @@ impl SafekeeperUpsert { version: self.version, host: &self.host, port: self.port, - active: self.active, http_port: self.http_port, availability_zone_id: &self.availability_zone_id, // None means a wish to not update this column. We expose abilities to update it via other means. @@ -1328,7 +1327,6 @@ struct InsertUpdateSafekeeper<'a> { version: i64, host: &'a str, port: i32, - active: bool, http_port: i32, availability_zone_id: &'a str, scheduling_policy: Option<&'a str>, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index e0a854fff7..adced3b77d 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -696,6 +696,11 @@ impl Reconciler { /// First we apply special case handling (e.g. for live migrations), and then a /// general case reconciliation where we walk through the intent by pageserver /// and call out to the pageserver to apply the desired state. + /// + /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that + /// all locations for the tenant are in the expected state. When nodes that are to be detached + /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a + /// state where it still requires later reconciliation. 
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it self.maybe_refresh_observed().await?; @@ -784,10 +789,18 @@ impl Reconciler { tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") } _ => { - // In all cases other than a matching observed configuration, we will - // reconcile this location. - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); - changes.push((node.clone(), wanted_conf)) + // Only try and configure secondary locations on nodes that are available. This + // allows the reconciler to "succeed" while some secondaries are offline (e.g. after + // a node failure, where the failed node will have a secondary intent) + if node.is_available() { + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } else { + tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } } } @@ -813,7 +826,21 @@ impl Reconciler { if self.cancel.is_cancelled() { return Err(ReconcileError::Cancel); } - self.location_config(&node, conf, None, false).await?; + // We only try to configure secondary locations if the node is available. This does + // not stop us succeeding with the reconcile, because our core goal is to make the + // shard _available_ (the attached location), and configuring secondary locations + // can be done lazily when the node becomes available (via background reconciliation). 
+ if node.is_available() { + self.location_config(&node, conf, None, false).await?; + } else { + // If the node is unavailable, we skip and consider the reconciliation successful: this + // is a common case where a pageserver is marked unavailable: we demote a location on + // that unavailable pageserver to secondary. + tracing::info!("Skipping configuring secondary location {node}, it is unavailable"); + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + } } // The condition below identifies a detach. We must have no attached intent and diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 51a4cf35be..f5cab9dd57 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_shard::TenantShard}; +use crate::{metrics::NodeLabelGroup, node::Node, tenant_shard::TenantShard}; use itertools::Itertools; use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; @@ -32,6 +32,9 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// How many shards have a location on this node (via [`crate::tenant_shard::IntentState`]) _and_ this node + /// is in their preferred AZ (i.e. this is their 'home' location) + home_shard_count: usize, /// Availability zone id in which the node resides az: AvailabilityZone, @@ -47,6 +50,12 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { preferred_az: &Option, context: &ScheduleContext, ) -> Option; + + /// Return a score that drops any components based on node utilization: this is useful + /// for finding scores for scheduling optimisation, when we want to avoid rescheduling + /// shards due to e.g. disk usage, to avoid flapping. 
+ fn for_optimization(&self) -> Self; + fn is_overloaded(&self) -> bool; fn node_id(&self) -> NodeId; } @@ -136,17 +145,13 @@ impl PartialOrd for SecondaryAzMatch { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeAttachmentSchedulingScore { - /// The number of shards belonging to the tenant currently being - /// scheduled that are attached to this node. - affinity_score: AffinityScore, /// Flag indicating whether this node matches the preferred AZ /// of the shard. For equal affinity scores, nodes in the matching AZ /// are considered first. az_match: AttachmentAzMatch, - /// Size of [`ScheduleContext::attached_nodes`] for the current node. - /// This normally tracks the number of attached shards belonging to the - /// tenant being scheduled that are already on this node. - attached_shards_in_context: usize, + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. + affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, /// Total number of shards attached to this node. 
When nodes have identical utilisation, this @@ -177,13 +182,25 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), - attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, node_id: *node_id, }) } + /// For use in scheduling optimisation, where we only want to consider the aspects + /// of the score that can only be resolved by moving things (such as inter-shard affinity + /// and AZ affinity), and ignore aspects that reflect the total utilization of a node (which + /// can fluctuate for other reasons) + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_attached_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -208,9 +225,9 @@ pub(crate) struct NodeSecondarySchedulingScore { affinity_score: AffinityScore, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, - /// Total number of shards attached to this node. When nodes have identical utilisation, this - /// acts as an anti-affinity between attached shards. - total_attached_shard_count: usize, + /// Anti-affinity with other non-home locations: this gives the behavior that secondaries + /// will spread out across the nodes in an AZ. 
+ total_non_home_shard_count: usize, /// Convenience to make selection deterministic in tests and empty systems node_id: NodeId, } @@ -237,11 +254,20 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { .copied() .unwrap_or(AffinityScore::FREE), utilization_score: utilization.cached_score(), - total_attached_shard_count: node.attached_shard_count, + total_non_home_shard_count: (node.shard_count - node.home_shard_count), node_id: *node_id, }) } + fn for_optimization(&self) -> Self { + Self { + utilization_score: 0, + total_non_home_shard_count: 0, + node_id: NodeId(0), + ..*self + } + } + fn is_overloaded(&self) -> bool { PageserverUtilization::is_overloaded(self.utilization_score) } @@ -293,6 +319,10 @@ impl AffinityScore { pub(crate) fn inc(&mut self) { self.0 += 1; } + + pub(crate) fn dec(&mut self) { + self.0 -= 1; + } } impl std::ops::Add for AffinityScore { @@ -324,9 +354,6 @@ pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, - /// Specifically how many _attached_ locations are on each node - pub(crate) attached_nodes: HashMap, - pub(crate) mode: ScheduleMode, } @@ -334,7 +361,6 @@ impl ScheduleContext { pub(crate) fn new(mode: ScheduleMode) -> Self { Self { nodes: HashMap::new(), - attached_nodes: HashMap::new(), mode, } } @@ -348,25 +374,31 @@ impl ScheduleContext { } } - pub(crate) fn push_attached(&mut self, node_id: NodeId) { - let entry = self.attached_nodes.entry(node_id).or_default(); - *entry += 1; - } - - pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { - self.nodes - .get(&node_id) - .copied() - .unwrap_or(AffinityScore::FREE) - } - - pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { - self.attached_nodes.get(&node_id).copied().unwrap_or(0) + /// Remove `shard`'s contributions to this context. 
This is useful when considering scheduling + /// this shard afresh, where we don't want it to e.g. experience anti-affinity to its current location. + pub(crate) fn project_detach(&self, shard: &TenantShard) -> Self { + let mut new_context = self.clone(); + + if let Some(attached) = shard.intent.get_attached() { + if let Some(score) = new_context.nodes.get_mut(attached) { + score.dec(); + } + } + + for secondary in shard.intent.get_secondary() { + if let Some(score) = new_context.nodes.get_mut(secondary) { + score.dec(); + } + } + + new_context } + /// For test, track the sum of AffinityScore values, which is effectively how many + /// attached or secondary locations have been registered with this context. #[cfg(test)] - pub(crate) fn attach_count(&self) -> usize { - self.attached_nodes.values().sum() + pub(crate) fn location_count(&self) -> usize { + self.nodes.values().map(|i| i.0).sum() } } @@ -388,6 +420,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -415,6 +448,7 @@ impl Scheduler { SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }, @@ -427,6 +461,9 @@ impl Scheduler { Some(node) => { node.shard_count += 1; node.attached_shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } } None => anyhow::bail!( "Tenant {} references nonexistent node {}", @@ -438,7 +475,12 @@ impl Scheduler { for node_id in shard.intent.get_secondary() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + if Some(&node.az) == shard.preferred_az() { + node.home_shard_count += 1; + } + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -482,13 +524,20 @@ impl Scheduler { /// /// It is an 
error to call this for a node that is not known to the scheduler (i.e. passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { + pub(crate) fn update_node_ref_counts( + &mut self, + node_id: NodeId, + preferred_az: Option<&AvailabilityZone>, + update: RefCountUpdate, + ) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; + let is_home_az = Some(&node.az) == preferred_az; + match update { RefCountUpdate::PromoteSecondary => { node.attached_shard_count += 1; @@ -496,19 +545,31 @@ impl Scheduler { RefCountUpdate::Attach => { node.shard_count += 1; node.attached_shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::Detach => { node.shard_count -= 1; node.attached_shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } RefCountUpdate::DemoteAttached => { node.attached_shard_count -= 1; } RefCountUpdate::AddSecondary => { node.shard_count += 1; + if is_home_az { + node.home_shard_count += 1; + } } RefCountUpdate::RemoveSecondary => { node.shard_count -= 1; + if is_home_az { + node.home_shard_count -= 1; + } } } @@ -594,6 +655,7 @@ impl Scheduler { entry.insert(SchedulerNode { shard_count: 0, attached_shard_count: 0, + home_shard_count: 0, may_schedule: node.may_schedule(), az: node.get_availability_zone_id().clone(), }); @@ -607,33 +669,20 @@ impl Scheduler { } } - /// Where we have several nodes to choose from, for example when picking a secondary location - /// to promote to an attached location, this method may be used to pick the best choice based - /// on the scheduler's knowledge of utilization and availability. - /// - /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the - /// caller can pick a node some other way. 
- pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { - if nodes.is_empty() { - return None; - } - - // TODO: When the utilization score returned by the pageserver becomes meaningful, - // schedule based on that instead of the shard count. - let node = nodes - .iter() - .map(|node_id| { - let may_schedule = self - .nodes - .get(node_id) - .map(|n| !matches!(n.may_schedule, MaySchedule::No)) - .unwrap_or(false); - (*node_id, may_schedule) - }) - .max_by_key(|(_n, may_schedule)| *may_schedule); - - // If even the preferred node has may_schedule==false, return None - node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + /// Calculate a single node's score, used in optimizer logic to compare specific + /// nodes' scores. + pub(crate) fn compute_node_score( + &mut self, + node_id: NodeId, + preferred_az: &Option, + context: &ScheduleContext, + ) -> Option + where + Score: NodeSchedulingScore, + { + self.nodes + .get_mut(&node_id) + .and_then(|node| Score::generate(&node_id, node, preferred_az, context)) } /// Compute a schedulling score for each node that the scheduler knows of @@ -727,7 +776,7 @@ impl Scheduler { tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", scores.iter().map(|i| i.node_id().0).collect::>() - ); + ); } // Note that we do not update shard count here to reflect the scheduling: that @@ -743,47 +792,74 @@ impl Scheduler { } /// For choosing which AZ to schedule a new shard into, use this. It will return the - /// AZ with the lowest median utilization. + /// AZ with the the lowest number of shards currently scheduled in this AZ as their home + /// location. /// /// We use an AZ-wide measure rather than simply selecting the AZ of the least-loaded /// node, because while tenants start out single sharded, when they grow and undergo - /// shard-split, they will occupy space on many nodes within an AZ. 
+ /// shard-split, they will occupy space on many nodes within an AZ. It is important + /// that we pick the AZ in a way that balances this _future_ load. /// - /// We use median rather than total free space or mean utilization, because - /// we wish to avoid preferring AZs that have low-load nodes resulting from - /// recent replacements. - /// - /// The practical result is that we will pick an AZ based on its median node, and - /// then actually _schedule_ the new shard onto the lowest-loaded node in that AZ. + /// Once we've picked an AZ, subsequent scheduling within that AZ will be driven by + /// nodes' utilization scores. pub(crate) fn get_az_for_new_tenant(&self) -> Option { if self.nodes.is_empty() { return None; } - let mut scores_by_az = HashMap::new(); - for (node_id, node) in &self.nodes { - let az_scores = scores_by_az.entry(&node.az).or_insert_with(Vec::new); - let score = match &node.may_schedule { - MaySchedule::Yes(utilization) => utilization.score(), - MaySchedule::No => PageserverUtilization::full().score(), - }; - az_scores.push((node_id, node, score)); + #[derive(Default)] + struct AzScore { + home_shard_count: usize, + scheduleable: bool, } - // Sort by utilization. Also include the node ID to break ties. - for scores in scores_by_az.values_mut() { - scores.sort_by_key(|i| (i.2, i.0)); + let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); + for node in self.nodes.values() { + let az = azs.entry(&node.az).or_default(); + az.home_shard_count += node.home_shard_count; + az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); } - let mut median_by_az = scores_by_az + // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where + // all nodes are overloaded or otherwise unschedulable). 
+ if azs.values().any(|i| i.scheduleable) { + azs.retain(|_, i| i.scheduleable); + } + + // Find the AZ with the lowest number of shards currently allocated + Some( + azs.into_iter() + .min_by_key(|i| (i.1.home_shard_count, i.0)) + .unwrap() + .0 + .clone(), + ) + } + + pub(crate) fn get_node_az(&self, node_id: &NodeId) -> Option { + self.nodes.get(node_id).map(|n| n.az.clone()) + } + + /// For use when choosing a preferred secondary location: filter out nodes that are not + /// available, and gather their AZs. + pub(crate) fn filter_usable_nodes( + &self, + nodes: &[NodeId], + ) -> Vec<(NodeId, Option)> { + nodes .iter() - .map(|(az, nodes)| (*az, nodes.get(nodes.len() / 2).unwrap().2)) - .collect::>(); - // Sort by utilization. Also include the AZ to break ties. - median_by_az.sort_by_key(|i| (i.1, i.0)); - - // Return the AZ with the lowest median utilization - Some(median_by_az.first().unwrap().0.clone()) + .filter_map(|node_id| { + let node = self + .nodes + .get(node_id) + .expect("Referenced nodes always exist"); + if matches!(node.may_schedule, MaySchedule::Yes(_)) { + Some((*node_id, Some(node.az.clone()))) + } else { + None + } + }) + .collect() } /// Unit test access to internal state @@ -796,6 +872,33 @@ impl Scheduler { pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().attached_shard_count } + + /// Some metrics that we only calculate periodically: this is simpler than + /// rigorously updating them on every change. 
+ pub(crate) fn update_metrics(&self) { + for (node_id, node) in &self.nodes { + let node_id_str = format!("{}", node_id); + let label_group = NodeLabelGroup { + az: &node.az.0, + node_id: &node_id_str, + }; + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_shards + .set(label_group.clone(), node.shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_attached_shards + .set(label_group.clone(), node.attached_shard_count as i64); + + crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_node_home_shards + .set(label_group.clone(), node.home_shard_count as i64); + } + } } #[cfg(test)] @@ -843,7 +946,14 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::NodeAvailability, models::utilization::test_utilization, + shard::ShardIdentity, + }; + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber, TenantShardId}, + }; use super::*; @@ -853,8 +963,8 @@ mod tests { let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); - let mut t1_intent = IntentState::new(); - let mut t2_intent = IntentState::new(); + let mut t1_intent = IntentState::new(None); + let mut t2_intent = IntentState::new(None); let context = ScheduleContext::default(); @@ -930,7 +1040,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &None, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(None); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1063,7 +1173,7 @@ mod tests { let scheduled = scheduler .schedule_shard::(&[], &preferred_az, context) .unwrap(); - let mut intent = IntentState::new(); + let mut intent = IntentState::new(preferred_az.clone()); intent.set_attached(scheduler, 
Some(scheduled)); scheduled_intents.push(intent); assert_eq!(scheduled, expect_node); @@ -1089,9 +1199,9 @@ mod tests { &mut context, ); - // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + // Node 1 and 3 (az-a) have same affinity score, so prefer the lowest node id. assert_scheduler_chooses::( - NodeId(2), + NodeId(1), Some(az_a_tag.clone()), &mut scheduled_intents, &mut scheduler, @@ -1107,26 +1217,6 @@ mod tests { &mut context, ); - // Avoid nodes in "az-b" for the secondary location. - // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. - assert_scheduler_chooses::( - NodeId(1), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - - // Avoid nodes in "az-b" for the secondary location. - // Node 3 has lower affinity score than 1, so prefer that. - assert_scheduler_chooses::( - NodeId(3), - Some(az_b_tag.clone()), - &mut scheduled_intents, - &mut scheduler, - &mut context, - ); - for mut intent in scheduled_intents { intent.clear(&mut scheduler); } @@ -1150,34 +1240,292 @@ mod tests { let mut scheduler = Scheduler::new(nodes.values()); - /// Force the utilization of a node in Scheduler's state to a particular - /// number of bytes used. - fn set_utilization(scheduler: &mut Scheduler, node_id: NodeId, shard_count: u32) { - let mut node = Node::new( - node_id, - "".to_string(), - 0, - "".to_string(), - 0, - scheduler.nodes.get(&node_id).unwrap().az.clone(), - ); - node.set_availability(NodeAvailability::Active(test_utilization::simple( - shard_count, - 0, - ))); - scheduler.node_upsert(&node); + /// Force the `home_shard_count` of a node directly: this is the metric used + /// by the scheduler when picking AZs. + fn set_shard_count(scheduler: &mut Scheduler, node_id: NodeId, shard_count: usize) { + let node = scheduler.nodes.get_mut(&node_id).unwrap(); + node.home_shard_count = shard_count; } // Initial empty state. Scores are tied, scheduler prefers lower AZ ID. 
assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - // Put some utilization on one node in AZ A: this should change nothing, as the median hasn't changed - set_utilization(&mut scheduler, NodeId(1), 1000000); - assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); - - // Put some utilization on a second node in AZ A: now the median has changed, so the scheduler - // should prefer the other AZ. - set_utilization(&mut scheduler, NodeId(2), 1000000); + // Home shard count is higher in AZ A, so AZ B will be preferred + set_shard_count(&mut scheduler, NodeId(1), 10); assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_b_tag.clone())); + + // Total home shard count is higher in AZ B, so we revert to preferring AZ A + set_shard_count(&mut scheduler, NodeId(4), 6); + set_shard_count(&mut scheduler, NodeId(5), 6); + assert_eq!(scheduler.get_az_for_new_tenant(), Some(az_a_tag.clone())); + } + + /// Test that when selecting AZs for many new tenants, we get the expected balance across nodes + #[test] + fn az_selection_many() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + let az_c_tag = AvailabilityZone("az-c".to_string()); + let nodes = test_utils::make_test_nodes( + 6, + &[ + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + az_a_tag.clone(), + az_b_tag.clone(), + az_c_tag.clone(), + ], + ); + + let mut scheduler = Scheduler::new(nodes.values()); + + // We should get 1/6th of these on each node, give or take a few... + let total_tenants = 300; + + // ...where the 'few' is the number of AZs, because the scheduling will sometimes overshoot + // on one AZ before correcting itself. This is because we select the 'home' AZ based on + // an AZ-wide metric, but we select the location for secondaries on a purely node-based + // metric (while excluding the home AZ). 
+ let grace = 3; + + let mut scheduled_shards = Vec::new(); + for _i in 0..total_tenants { + let preferred_az = scheduler.get_az_for_new_tenant().unwrap(); + + let mut node_home_counts = scheduler + .nodes + .iter() + .map(|(node_id, node)| (node_id, node.home_shard_count)) + .collect::>(); + node_home_counts.sort_by_key(|i| i.0); + eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::generate(), + shard_number: ShardNumber(0), + shard_count: ShardCount(1), + }; + + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + Some(preferred_az), + ); + + let mut context = ScheduleContext::default(); + shard.schedule(&mut scheduler, &mut context).unwrap(); + eprintln!("Scheduled shard at {:?}", shard.intent); + + scheduled_shards.push(shard); + } + + for (node_id, node) in &scheduler.nodes { + eprintln!( + "Node {}: {} {} {}", + node_id, node.shard_count, node.attached_shard_count, node.home_shard_count + ); + } + + for node in scheduler.nodes.values() { + assert!((node.home_shard_count as i64 - total_tenants as i64 / 6).abs() < grace); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } + } + + #[test] + /// Make sure that when we have an odd number of nodes and an even number of shards, we still + /// get scheduling stability. 
+ fn odd_nodes_stability() { + let az_a = AvailabilityZone("az-a".to_string()); + let az_b = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes( + 10, + &[ + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_a.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + az_b.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_shards = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn schedule_shard( + tenant_shard_id: TenantShardId, + expect_attached: NodeId, + expect_secondary: NodeId, + scheduled_shards: &mut Vec, + scheduler: &mut Scheduler, + preferred_az: Option, + context: &mut ScheduleContext, + ) { + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + preferred_az, + ); + + shard.schedule(scheduler, context).unwrap(); + + assert_eq!(shard.intent.get_attached().unwrap(), expect_attached); + assert_eq!( + shard.intent.get_secondary().first().unwrap(), + &expect_secondary + ); + + scheduled_shards.push(shard); + } + + let tenant_id = TenantId::generate(); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(1), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(2), + shard_count: 
ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(3), + shard_count: ShardCount(8), + }, + NodeId(4), + NodeId(9), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(4), + shard_count: ShardCount(8), + }, + NodeId(5), + NodeId(10), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(5), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(6), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(6), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(7), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(7), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(8), + &mut scheduled_shards, + &mut scheduler, + Some(az_a.clone()), + &mut context, + ); + + // Assert that the optimizer suggests nochanges, i.e. our initial scheduling was stable. + for shard in &scheduled_shards { + assert_eq!(shard.optimize_attachment(&mut scheduler, &context), None); + } + + for mut shard in scheduled_shards { + shard.intent.clear(&mut scheduler); + } } } diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 44c91619ab..14c30c296d 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -36,7 +36,6 @@ diesel::table! 
{ version -> Int8, host -> Text, port -> Int4, - active -> Bool, http_port -> Int4, availability_zone_id -> Text, scheduling_policy -> Varchar, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 265b2798d2..cbb9103880 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1404,7 +1404,11 @@ impl Service { // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. - let mut intent = IntentState::new(); + let mut intent = IntentState::new( + tsp.preferred_az_id + .as_ref() + .map(|az| AvailabilityZone(az.clone())), + ); if let Some(generation_pageserver) = tsp.generation_pageserver.map(|n| NodeId(n as u64)) { if nodes.contains_key(&generation_pageserver) { @@ -2474,18 +2478,29 @@ impl Service { tenant_id: TenantId, _guard: &TracingExclusiveGuard, ) -> Result<(), ApiError> { - let present_in_memory = { + // Check if the tenant is present in memory, and select an AZ to use when loading + // if we will load it. + let load_in_az = { let locked = self.inner.read().unwrap(); - locked + let existing = locked .tenants .range(TenantShardId::tenant_range(tenant_id)) - .next() - .is_some() - }; + .next(); - if present_in_memory { - return Ok(()); - } + // If the tenant is not present in memory, we expect to load it from database, + // so let's figure out what AZ to load it into while we have self.inner locked. + if existing.is_none() { + locked + .scheduler + .get_az_for_new_tenant() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "No AZ with nodes found to load tenant" + )))? 
+ } else { + // We already have this tenant in memory + return Ok(()); + } + }; let tenant_shards = self.persistence.load_tenant(tenant_id).await?; if tenant_shards.is_empty() { @@ -2494,8 +2509,20 @@ impl Service { )); } - // TODO: choose a fresh AZ to use for this tenant when un-detaching: there definitely isn't a running - // compute, so no benefit to making AZ sticky across detaches. + // Update the persistent shards with the AZ that we are about to apply to in-memory state + self.persistence + .set_tenant_shard_preferred_azs( + tenant_shards + .iter() + .map(|t| { + ( + t.get_tenant_shard_id().expect("Corrupt shard in database"), + Some(load_in_az.clone()), + ) + }) + .collect(), + ) + .await?; let mut locked = self.inner.write().unwrap(); tracing::info!( @@ -2505,7 +2532,7 @@ impl Service { ); locked.tenants.extend(tenant_shards.into_iter().map(|p| { - let intent = IntentState::new(); + let intent = IntentState::new(Some(load_in_az.clone())); let shard = TenantShard::from_persistent(p, intent).expect("Corrupt shard row in database"); @@ -4236,6 +4263,22 @@ impl Service { } tracing::info!("Restoring parent shard {tenant_shard_id}"); + + // Drop any intents that refer to unavailable nodes, to enable this abort to proceed even + // if the original attachment location is offline. 
+ if let Some(node_id) = shard.intent.get_attached() { + if !nodes.get(node_id).unwrap().is_available() { + tracing::info!("Demoting attached intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.demote_attached(scheduler, *node_id); + } + } + for node_id in shard.intent.get_secondary().clone() { + if !nodes.get(&node_id).unwrap().is_available() { + tracing::info!("Dropping secondary intent for {tenant_shard_id} on unavailable node {node_id}"); + shard.intent.remove_secondary(scheduler, node_id); + } + } + shard.splitting = SplitState::Idle; if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { // If this shard can't be scheduled now (perhaps due to offline nodes or @@ -4389,15 +4432,13 @@ impl Service { let mut child_state = TenantShard::new(child, child_shard, policy.clone(), preferred_az.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.intent = + IntentState::single(scheduler, Some(pageserver), preferred_az.clone()); child_state.observed = ObservedState { locations: child_observed, }; child_state.generation = Some(generation); child_state.config = config.clone(); - if let Some(preferred_az) = &preferred_az { - child_state.set_preferred_az(preferred_az.clone()); - } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -5014,6 +5055,8 @@ impl Service { // If our new attached node was a secondary, it no longer should be. 
shard.intent.remove_secondary(scheduler, migrate_req.node_id); + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { if n > 0 { @@ -5025,8 +5068,6 @@ impl Service { shard.intent.push_secondary(scheduler, old_attached); } } - - shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); } PlacementPolicy::Secondary => { shard.intent.clear(scheduler); @@ -5055,6 +5096,69 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + pub(crate) async fn tenant_shard_migrate_secondary( + &self, + tenant_shard_id: TenantShardId, + migrate_req: TenantShardMigrateRequest, + ) -> Result { + let waiter = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. 
+ tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if shard.intent.get_secondary().len() == 1 + && shard.intent.get_secondary()[0] == migrate_req.node_id + { + tracing::info!( + "Migrating secondary to {node}: intent is unchanged {:?}", + shard.intent + ); + } else if shard.intent.get_attached() == &Some(migrate_req.node_id) { + tracing::info!("Migrating secondary to {node}: already attached where we were asked to create a secondary"); + } else { + let old_secondaries = shard.intent.get_secondary().clone(); + for secondary in old_secondaries { + shard.intent.remove_secondary(scheduler, secondary); + } + + shard.intent.push_secondary(scheduler, migrate_req.node_id); + shard.sequence = shard.sequence.next(); + tracing::info!( + "Migrating secondary to {node}: new intent {:?}", + shard.intent + ); + } + + self.maybe_reconcile_shard(shard, nodes) + }; + + if let Some(waiter) = waiter { + waiter.wait_timeout(RECONCILE_TIMEOUT).await?; + } else { + tracing::info!("Migration is a no-op"); + } + + Ok(TenantShardMigrateResponse {}) + } + /// 'cancel' in this context means cancel any ongoing reconcile pub(crate) async fn tenant_shard_cancel_reconcile( &self, @@ -5256,7 +5360,8 @@ impl Service { expect_nodes.sort_by_key(|n| n.node_id); nodes.sort_by_key(|n| n.node_id); - if nodes != expect_nodes { + // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error + let node_result = if nodes != expect_nodes { tracing::error!("Consistency check failed on nodes."); tracing::error!( "Nodes in memory: {}", @@ -5268,10 +5373,12 @@ impl Service { serde_json::to_string(&nodes) .map_err(|e| ApiError::InternalServerError(e.into()))? 
); - return Err(ApiError::InternalServerError(anyhow::anyhow!( + Err(ApiError::InternalServerError(anyhow::anyhow!( "Node consistency failure" - ))); - } + ))) + } else { + Ok(()) + }; let mut persistent_shards = self.persistence.load_active_tenant_shards().await?; persistent_shards @@ -5281,6 +5388,7 @@ impl Service { if persistent_shards != expect_shards { tracing::error!("Consistency check failed on shards."); + tracing::error!( "Shards in memory: {}", serde_json::to_string(&expect_shards) @@ -5291,12 +5399,57 @@ impl Service { serde_json::to_string(&persistent_shards) .map_err(|e| ApiError::InternalServerError(e.into()))? ); + + // The total dump log lines above are useful in testing but in the field grafana will + // usually just drop them because they're so large. So we also do some explicit logging + // of just the diffs. + let persistent_shards = persistent_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + let expect_shards = expect_shards + .into_iter() + .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp)) + .collect::>(); + for (tenant_shard_id, persistent_tsp) in &persistent_shards { + match expect_shards.get(tenant_shard_id) { + None => { + tracing::error!( + "Shard {} found in database but not in memory", + tenant_shard_id + ); + } + Some(expect_tsp) => { + if expect_tsp != persistent_tsp { + tracing::error!( + "Shard {} is inconsistent. In memory: {}, database has: {}", + tenant_shard_id, + serde_json::to_string(expect_tsp).unwrap(), + serde_json::to_string(&persistent_tsp).unwrap() + ); + } + } + } + } + + // Having already logged any differences, log any shards that simply aren't present in the database + for (tenant_shard_id, memory_tsp) in &expect_shards { + if !persistent_shards.contains_key(tenant_shard_id) { + tracing::error!( + "Shard {} found in memory but not in database: {}", + tenant_shard_id, + serde_json::to_string(memory_tsp) + .map_err(|e| ApiError::InternalServerError(e.into()))? 
+ ); + } + } + return Err(ApiError::InternalServerError(anyhow::anyhow!( "Shard consistency failure" ))); } - Ok(()) + node_result } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that @@ -5600,7 +5753,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, - register_req.availability_zone_id, + register_req.availability_zone_id.clone(), ); // TODO: idempotency if the node already exists in the database @@ -5620,8 +5773,9 @@ impl Service { .set(locked.nodes.len() as i64); tracing::info!( - "Registered pageserver {}, now have {} pageservers", + "Registered pageserver {} ({}), now have {} pageservers", register_req.node_id, + register_req.availability_zone_id, locked.nodes.len() ); Ok(()) @@ -6236,7 +6390,7 @@ impl Service { /// available. A return value of 0 indicates that everything is fully reconciled already. fn reconcile_all(&self) -> usize { let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); // This function is an efficient place to update lazy statistics, since we are walking @@ -6297,6 +6451,9 @@ impl Service { } } + // Some metrics are calculated from SchedulerNode state, update these periodically + scheduler.update_metrics(); + // Process any deferred tenant drops for (tenant_id, guard) in drop_detached_tenants { self.maybe_drop_tenant(tenant_id, &mut locked, &guard); @@ -6355,6 +6512,7 @@ impl Service { // Shard was dropped between planning and execution; continue; }; + tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}"); if shard.apply_optimization(scheduler, optimization) { optimizations_applied += 1; if self.maybe_reconcile_shard(shard, nodes).is_some() { @@ -6385,7 +6543,13 @@ impl Service { let mut work = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, 
scheduler) = locked.parts_mut(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // We are going to plan a bunch of optimisations before applying any of them, so the + // utilisation stats on nodes will be effectively stale for the >1st optimisation we + // generate. To avoid this causing unstable migrations/flapping, it's important that the + // code in TenantShard for finding optimisations uses [`NodeAttachmentSchedulingScore::disregard_utilization`] + // to ignore the utilisation component of the score. for (_tenant_id, schedule_context, shards) in TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) @@ -6416,13 +6580,28 @@ impl Service { continue; } - // TODO: optimization calculations are relatively expensive: create some fast-path for - // the common idle case (avoiding the search on tenants that we have recently checked) + // Fast path: we may quickly identify shards that don't have any possible optimisations + if !shard.maybe_optimizable(scheduler, &schedule_context) { + if cfg!(feature = "testing") { + // Check that maybe_optimizable doesn't disagree with the actual optimization functions. + // Only do this in testing builds because it is not a correctness-critical check, so we shouldn't + // panic in prod if we hit this, or spend cycles on it in prod. + assert!(shard + .optimize_attachment(scheduler, &schedule_context) + .is_none()); + assert!(shard + .optimize_secondary(scheduler, &schedule_context) + .is_none()); + } + continue; + } + if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to // its primary location based on soft constraints, cut it over. 
- shard.optimize_attachment(nodes, &schedule_context) + shard.optimize_attachment(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for attachment: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } else if let Some(optimization) = @@ -6432,6 +6611,7 @@ impl Service { // in the same tenant with secondary locations on the node where they originally split. shard.optimize_secondary(scheduler, &schedule_context) { + tracing::info!(tenant_shard_id=%shard.tenant_shard_id, "Identified optimization for secondary: {optimization:?}"); work.push((shard.tenant_shard_id, optimization)); break; } @@ -6480,8 +6660,10 @@ impl Service { } } } - ScheduleOptimizationAction::ReplaceSecondary(_) => { - // No extra checks needed to replace a secondary: this does not interrupt client access + ScheduleOptimizationAction::ReplaceSecondary(_) + | ScheduleOptimizationAction::CreateSecondary(_) + | ScheduleOptimizationAction::RemoveSecondary(_) => { + // No extra checks needed to manage secondaries: this does not interrupt client access validated_work.push((tenant_shard_id, optimization)) } }; @@ -6553,26 +6735,35 @@ impl Service { /// we have this helper to move things along faster. 
#[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { - let (attached_node, secondary_node) = { + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: not found" + ); return; }; - let (Some(attached), Some(secondary)) = ( - shard.intent.get_attached(), - shard.intent.get_secondary().first(), - ) else { + + let Some(attached) = shard.intent.get_attached() else { + tracing::warn!( + "Skipping kick of secondary download for {tenant_shard_id}: no attached" + ); return; }; - ( - locked.nodes.get(attached).unwrap().clone(), - locked.nodes.get(secondary).unwrap().clone(), - ) + + let secondaries = shard + .intent + .get_secondary() + .iter() + .map(|n| locked.nodes.get(n).unwrap().clone()) + .collect::>(); + + (locked.nodes.get(attached).unwrap().clone(), secondaries) }; // Make remote API calls to upload + download heatmaps: we ignore errors because this is just // a 'kick' to let scheduling optimisation run more promptly. 
- attached_node + match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, &self.config.jwt_token, @@ -6581,22 +6772,57 @@ impl Service { SHORT_RECONCILE_TIMEOUT, &self.cancel, ) - .await; + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to upload heatmap from {attached_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!( + "Cancelled while uploading heatmap from {attached_node} for {tenant_shard_id}" + ); + } + Some(Ok(_)) => { + tracing::info!( + "Successfully uploaded heatmap from {attached_node} for {tenant_shard_id}" + ); + } + } - secondary_node - .with_client_retries( - |client| async move { - client - .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) - .await - }, - &self.config.jwt_token, - 3, - 10, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; + for secondary_node in secondaries { + match secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(Duration::from_secs(1)), + ) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + Some(Err(e)) => { + tracing::info!( + "Failed to download heatmap from {secondary_node} for {tenant_shard_id}: {e}" + ); + } + None => { + tracing::info!("Cancelled while downloading heatmap from {secondary_node} for {tenant_shard_id}"); + } + Some(Ok(progress)) => { + tracing::info!("Successfully downloaded heatmap from {secondary_node} for {tenant_shard_id}: {progress:?}"); + } + } + } } /// Look for shards which are oversized and in need of splitting @@ -7032,9 +7258,15 @@ impl Service { fn fill_node_plan(&self, node_id: NodeId) -> Vec { let mut locked = self.inner.write().unwrap(); let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + let (nodes, tenants, _scheduler) = locked.parts_mut(); - let mut tids_by_node = locked - .tenants + let node_az = nodes 
+ .get(&node_id) + .expect("Node must exist") + .get_availability_zone_id() + .clone(); + + let mut tids_by_node = tenants .iter_mut() .filter_map(|(tid, tenant_shard)| { if !matches!( @@ -7047,6 +7279,25 @@ impl Service { return None; } + // AZ check: when filling nodes after a restart, our intent is to move _back_ the + // shards which belong on this node, not to promote shards whose scheduling preference + // would be on their currently attached node. So we will avoid promoting shards whose + // home AZ doesn't match the AZ of the node we're filling. + match tenant_shard.preferred_az() { + None => { + // Shard doesn't have an AZ preference: it is eligible to be moved. + } + Some(az) if az == &node_az => { + // This shard's home AZ is equal to the node we're filling: it is + // eligible to be moved: fall through; + } + Some(_) => { + // This shard's home AZ is somewhere other than the node we're filling: + // do not include it in the fill plan. + return None; + } + } + if tenant_shard.intent.get_secondary().contains(&node_id) { if let Some(primary) = tenant_shard.intent.get_attached() { return Some((*primary, *tid)); diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs index d38010a27e..dd6913e988 100644 --- a/storage_controller/src/service/context_iterator.rs +++ b/storage_controller/src/service/context_iterator.rs @@ -43,9 +43,6 @@ impl<'a> Iterator for TenantShardContextIterator<'a> { // Accumulate the schedule context for all the shards in a tenant schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } tenant_shards.push(shard); if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { @@ -115,7 +112,7 @@ mod tests { assert_eq!(tenant_id, t1_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - 
assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t2_id); @@ -124,13 +121,13 @@ mod tests { assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); assert_eq!(shards.len(), 4); - assert_eq!(context.attach_count(), 4); + assert_eq!(context.location_count(), 8); let (tenant_id, context, shards) = iter.next().unwrap(); assert_eq!(tenant_id, t3_id); assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); assert_eq!(shards.len(), 1); - assert_eq!(context.attach_count(), 1); + assert_eq!(context.location_count(), 2); for shard in tenants.values_mut() { shard.intent.clear(&mut scheduler); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index c17989a316..79ed628c25 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -11,16 +11,14 @@ use crate::{ persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{ - AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext, - SecondaryShardTag, + AffinityScore, AttachedShardTag, NodeSchedulingScore, NodeSecondarySchedulingScore, + RefCountUpdate, ScheduleContext, SecondaryShardTag, ShardTag, }, service::ReconcileResultRequest, }; use futures::future::{self, Either}; use itertools::Itertools; -use pageserver_api::controller_api::{ - AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, -}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, shard::{ShardIdentity, TenantShardId}, @@ -33,6 +31,7 @@ use utils::{ generation::Generation, id::NodeId, seqwait::{SeqWait, SeqWaitError}, + shard::ShardCount, sync::gate::GateGuard, }; @@ -147,45 
+146,67 @@ pub(crate) struct TenantShard { // Support/debug tool: if something is going wrong or flapping with scheduling, this may // be set to a non-active state to avoid making changes while the issue is fixed. scheduling_policy: ShardSchedulingPolicy, +} + +#[derive(Clone, Debug, Serialize)] +pub(crate) struct IntentState { + attached: Option, + secondary: Vec, // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. preferred_az_id: Option, } -#[derive(Default, Clone, Debug, Serialize)] -pub(crate) struct IntentState { - attached: Option, - secondary: Vec, -} - impl IntentState { - pub(crate) fn new() -> Self { + pub(crate) fn new(preferred_az_id: Option) -> Self { Self { attached: None, secondary: vec![], + preferred_az_id, } } - pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + pub(crate) fn single( + scheduler: &mut Scheduler, + node_id: Option, + preferred_az_id: Option, + ) -> Self { if let Some(node_id) = node_id { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + node_id, + preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } Self { attached: node_id, secondary: vec![], + preferred_az_id, } } pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } if let Some(new_attached) = &new_attached { - scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); + scheduler.update_node_ref_counts( + *new_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Attach, + ); } self.attached = new_attached; } + + if let Some(new_attached) = &new_attached { + assert!(!self.secondary.contains(new_attached)); + 
} } /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from @@ -204,15 +225,28 @@ impl IntentState { let demoted = self.attached; self.attached = Some(promote_secondary); - scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + scheduler.update_node_ref_counts( + promote_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::PromoteSecondary, + ); if let Some(demoted) = demoted { - scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + demoted, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { - debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); + assert!(!self.secondary.contains(&new_secondary)); + assert!(self.attached != Some(new_secondary)); + scheduler.update_node_ref_counts( + new_secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::AddSecondary, + ); self.secondary.push(new_secondary); } @@ -220,27 +254,43 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) 
{ - scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + secondary, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::RemoveSecondary, + ); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + scheduler.update_node_ref_counts( + old_attached, + self.preferred_az_id.as_ref(), + RefCountUpdate::Detach, + ); } self.clear_secondary(scheduler); @@ -275,7 +325,11 @@ impl IntentState { if self.attached == Some(node_id) { self.attached = None; self.secondary.push(node_id); - scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); + scheduler.update_node_ref_counts( + node_id, + self.preferred_az_id.as_ref(), + RefCountUpdate::DemoteAttached, + ); true } else { false @@ -315,6 +369,7 @@ pub(crate) struct ObservedStateLocation { /// we know that we might have some state on this node. pub(crate) conf: Option, } + pub(crate) struct ReconcilerWaiter { // For observability purposes, remember the ID of the shard we're // waiting for. 
@@ -360,6 +415,10 @@ pub(crate) enum ScheduleOptimizationAction { ReplaceSecondary(ReplaceSecondary), // Migrate attachment to an existing secondary location MigrateAttachment(MigrateAttachment), + // Create a secondary location, with the intent of later migrating to it + CreateSecondary(NodeId), + // Remove a secondary location that we previously created to facilitate a migration + RemoveSecondary(NodeId), } #[derive(Eq, PartialEq, Debug, Clone)] @@ -486,7 +545,7 @@ impl TenantShard { Self { tenant_shard_id, policy, - intent: IntentState::default(), + intent: IntentState::new(preferred_az_id), generation: Some(Generation::new(0)), shard, observed: ObservedState::default(), @@ -500,7 +559,6 @@ impl TenantShard { last_error: Arc::default(), pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), - preferred_az_id, } } @@ -563,7 +621,7 @@ impl TenantShard { return Ok((false, node_id)); } - if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + if let Some(promote_secondary) = self.preferred_secondary(scheduler) { // Promote a secondary tracing::debug!("Promoted secondary {} to attached", promote_secondary); self.intent.promote_attached(scheduler, promote_secondary); @@ -572,7 +630,7 @@ impl TenantShard { // Pick a fresh node: either we had no secondaries or none were schedulable let node_id = scheduler.schedule_shard::( &self.intent.secondary, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; tracing::debug!("Selected {} as attached", node_id); @@ -594,9 +652,6 @@ impl TenantShard { let r = self.do_schedule(scheduler, context); context.avoid(&self.intent.all_pageservers()); - if let Some(attached) = self.intent.get_attached() { - context.push_attached(*attached); - } r } @@ -631,24 +686,7 @@ impl TenantShard { use PlacementPolicy::*; match self.policy { Attached(secondary_count) => { - let retain_secondaries = if self.intent.attached.is_none() - && 
scheduler.node_preferred(&self.intent.secondary).is_some() - { - // If we have no attached, and one of the secondaries is elegible to be promoted, retain - // one more secondary than we usually would, as one of them will become attached futher down this function. - secondary_count + 1 - } else { - secondary_count - }; - - while self.intent.secondary.len() > retain_secondaries { - // We have no particular preference for one secondary location over another: just - // arbitrarily drop from the end - self.intent.pop_secondary(scheduler); - modified = true; - } - - // Should have exactly one attached, and N secondaries + // Should have exactly one attached, and at least N secondaries let (modified_attached, attached_node_id) = self.schedule_attached(scheduler, context)?; modified |= modified_attached; @@ -657,7 +695,7 @@ impl TenantShard { while self.intent.secondary.len() < secondary_count { let node_id = scheduler.schedule_shard::( &used_pageservers, - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -674,7 +712,7 @@ impl TenantShard { // Populate secondary by scheduling a fresh node let node_id = scheduler.schedule_shard::( &[], - &self.preferred_az_id, + &self.intent.preferred_az_id, context, )?; self.intent.push_secondary(scheduler, node_id); @@ -718,7 +756,7 @@ impl TenantShard { ) -> Result<(), ScheduleError> { let promote_to = match promote_to { Some(node) => node, - None => match scheduler.node_preferred(self.intent.get_secondary()) { + None => match self.preferred_secondary(scheduler) { Some(node) => node, None => { return Err(ScheduleError::ImpossibleConstraint); @@ -745,90 +783,276 @@ impl TenantShard { Ok(()) } + /// Returns None if the current location's score is unavailable, i.e. 
cannot draw a conclusion + fn is_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + candidate: NodeId, + ) -> Option { + let Some(candidate_score) = scheduler.compute_node_score::( + candidate, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // The candidate node is unavailable for scheduling or otherwise couldn't get a score + return None; + }; + + match scheduler.compute_node_score::( + current, + &self.intent.preferred_az_id, + schedule_context, + ) { + Some(current_score) => { + // Ignore utilization components when comparing scores: we don't want to migrate + // because of transient load variations, it risks making the system thrash, and + // migrating for utilization requires a separate high level view of the system to + // e.g. prioritize moving larger or smaller tenants, rather than arbitrarily + // moving things around in the order that we hit this function. + let candidate_score = candidate_score.for_optimization(); + let current_score = current_score.for_optimization(); + + if candidate_score < current_score { + tracing::info!("Found a lower scoring location! {candidate} is better than {current} ({candidate_score:?} is better than {current_score:?})"); + Some(true) + } else { + // The candidate node is no better than our current location, so don't migrate + tracing::debug!( + "Candidate node {candidate} is no better than our current location {current} (candidate {candidate_score:?} vs current {current_score:?})", + ); + Some(false) + } + } + None => { + // The current node is unavailable for scheduling, so we can't make any sensible + // decisions about optimisation. This should be a transient state -- if the node + // is offline then it will get evacuated, if it is blocked by a scheduling mode + // then we will respect that mode by doing nothing. 
+ tracing::debug!("Current node {current} is unavailable for scheduling"); + None + } + } + } + + fn find_better_location( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + current: NodeId, + hard_exclude: &[NodeId], + ) -> Option { + // Look for a lower-scoring location to attach to + let Ok(candidate_node) = scheduler.schedule_shard::( + hard_exclude, + &self.intent.preferred_az_id, + schedule_context, + ) else { + // A scheduling error means we have no possible candidate replacements + tracing::debug!("No candidate node found"); + return None; + }; + + if candidate_node == current { + // We're already at the best possible location, so don't migrate + tracing::debug!("Candidate node {candidate_node} is already in use"); + return None; + } + + self.is_better_location::(scheduler, schedule_context, current, candidate_node) + .and_then(|better| if better { Some(candidate_node) } else { None }) + } + + /// This function is an optimization, used to avoid doing large numbers of scheduling operations + /// when looking for optimizations. This function uses knowledge of how scores work to do some + /// fast checks for whether it may be possible to improve a score. + /// + /// If we return true, it only means that optimization _might_ be possible, not that it necessarily is. If we + /// return false, it definitely means that calling [`Self::optimize_attachment`] or [`Self::optimize_secondary`] would do no + /// work. 
+ pub(crate) fn maybe_optimizable( + &self, + scheduler: &mut Scheduler, + schedule_context: &ScheduleContext, + ) -> bool { + // Sharded tenant: check if any locations have a nonzero affinity score + if self.shard.count >= ShardCount(1) { + let schedule_context = schedule_context.project_detach(self); + for node in self.intent.all_pageservers() { + if let Some(af) = schedule_context.nodes.get(&node) { + if *af > AffinityScore(0) { + return true; + } + } + } + } + + // Attached tenant: check if the attachment is outside the preferred AZ + if let PlacementPolicy::Attached(_) = self.policy { + if let Some(attached) = self.intent.get_attached() { + if scheduler.get_node_az(attached) != self.intent.preferred_az_id { + return true; + } + } + } + + // Tenant with secondary locations: check if any are within the preferred AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return true; + } + } + + // Does the tenant have excess secondaries? + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + return true; + } + + // Fall through: no optimizations possible + false + } + /// Optimize attachments: if a shard has a secondary location that is preferable to /// its primary location based on soft constraints, switch that secondary location /// to be attached. #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn optimize_attachment( &self, - nodes: &HashMap, + scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { let attached = (*self.intent.get_attached())?; - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. 
- return None; - } - let current_affinity_score = schedule_context.get_node_affinity(attached); - let current_attachment_count = schedule_context.get_node_attachments(attached); + let schedule_context = schedule_context.project_detach(self); - // Generate score for each node, dropping any un-schedulable nodes. - let all_pageservers = self.intent.all_pageservers(); - let mut scores = all_pageservers - .iter() - .flat_map(|node_id| { - let node = nodes.get(node_id); - if node.is_none() { - None - } else if matches!( - node.unwrap().get_scheduling(), - NodeSchedulingPolicy::Filling - ) { - // If the node is currently filling, don't count it as a candidate to avoid, - // racing with the background fill. - None - } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { - None - } else { - let affinity_score = schedule_context.get_node_affinity(*node_id); - let attachment_count = schedule_context.get_node_attachments(*node_id); - Some((*node_id, affinity_score, attachment_count)) - } - }) - .collect::>(); - - // Sort precedence: - // 1st - prefer nodes with the lowest total affinity score - // 2nd - prefer nodes with the lowest number of attachments in this context - // 3rd - if all else is equal, sort by node ID for determinism in tests. - scores.sort_by_key(|i| (i.1, i.2, i.0)); - - if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = - scores.first() - { - if attached != *preferred_node { - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called (e.g. there's no point migrating from - // a location with score 1 to a score zero, because on next location the situation - // would be the same, but in reverse). 
- if current_affinity_score > *preferred_affinity_score + AffinityScore(1) - || current_attachment_count > *preferred_attachment_count + 1 - { - tracing::info!( - "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", - self.intent.get_secondary() - ); - return Some(ScheduleOptimization { - sequence: self.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: attached, - new_attached_node_id: *preferred_node, - }), - }); - } - } else { - tracing::debug!( - "Node {} is already preferred (score {:?})", - preferred_node, - preferred_affinity_score - ); + // If we already have a secondary that is a better location than our current location, + // then simply migrate to it. + for secondary in self.intent.get_secondary() { + if let Some(true) = self.is_better_location::( + scheduler, + &schedule_context, + attached, + *secondary, + ) { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *secondary, + }), + }); } } - // Fall-through: we didn't find an optimization - None + // Given that none of our current secondaries is a better location than our current + // attached location (checked above), we may trim any secondaries that are not needed + // for the placement policy. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // This code path cleans up extra secondaries after migrating, and/or + // trims extra secondaries after a PlacementPolicy::Attached(N) was + // modified to decrease N. 
+ + let secondary_scores = self + .intent + .get_secondary() + .iter() + .map(|node_id| { + ( + *node_id, + scheduler.compute_node_score::( + *node_id, + &self.intent.preferred_az_id, + &schedule_context, + ), + ) + }) + .collect::>(); + + if secondary_scores.iter().any(|score| score.1.is_none()) { + // Don't have full list of scores, so can't make a good decision about which to drop unless + // there is an obvious one in the wrong AZ + for secondary in self.intent.get_secondary() { + if scheduler.get_node_az(secondary) == self.intent.preferred_az_id { + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(*secondary), + }); + } + } + + // Fall through: we didn't identify one to remove. This ought to be rare. + tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)", + self.intent.get_secondary() + ); + } else { + let victim = secondary_scores + .iter() + .max_by_key(|score| score.1.unwrap()) + .unwrap() + .0; + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(victim), + }); + } + } + + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + attached, + &[], // Don't exclude secondaries: our preferred attachment location may be a secondary + ); + + // We have found a candidate and confirmed that its score is preferable + // to our current location. See if we have a secondary location in the preferred location already: if not, + // then create one. + if let Some(replacement) = replacement { + // If we are currently in non-preferred AZ, then the scheduler might suggest a location that is better, but still + // not in our preferred AZ. 
Migration has a cost in resources and an impact on the workload, so we want to avoid doing + // multiple hops where we might go to some other AZ before eventually finding a suitable location in our preferred + // AZ: skip this optimization if it is not in our final, preferred AZ. + // + // This should be a transient state, there should always be capacity eventually in our preferred AZ (even if nodes + // there are too overloaded for scheduler to suggest them, more should be provisioned eventually). + if self.intent.preferred_az_id.is_some() + && scheduler.get_node_az(&replacement) != self.intent.preferred_az_id + { + tracing::debug!( + "Candidate node {replacement} is not in preferred AZ {:?}", + self.intent.preferred_az_id + ); + + // This should only happen if our current location is not in the preferred AZ, otherwise + // [`Self::find_better_location`] should have rejected any other location outside the preferred AZ, because + // AZ is the highest priority part of NodeAttachmentSchedulingScore. + debug_assert!(scheduler.get_node_az(&attached) != self.intent.preferred_az_id); + + return None; + } + + if !self.intent.get_secondary().contains(&replacement) { + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::CreateSecondary(replacement), + }) + } else { + // We already have a secondary in the preferred location, let's try migrating to it. Our caller + // will check the warmth of the destination before deciding whether to really execute this. + Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: replacement, + }), + }) + } + } else { + // We didn't find somewhere we'd rather be, and we don't have any excess secondaries + // to clean up: no action required. 
+ None + } } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] @@ -837,50 +1061,40 @@ impl TenantShard { scheduler: &mut Scheduler, schedule_context: &ScheduleContext, ) -> Option { - if self.intent.secondary.is_empty() { - // We can only do useful work if we have both attached and secondary locations: this - // function doesn't schedule new locations, only swaps between attached and secondaries. + if self.intent.get_secondary().len() > self.policy.want_secondaries() { + // We have extra secondaries, perhaps to facilitate a migration of the attached location: + // do nothing, it is up to [`Self::optimize_attachment`] to clean them up. When that's done, + // and we are called again, we will proceed. + tracing::debug!("Too many secondaries: skipping"); return None; } + let schedule_context = schedule_context.project_detach(self); + for secondary in self.intent.get_secondary() { - let Some(affinity_score) = schedule_context.nodes.get(secondary) else { - // We're already on a node unaffected any affinity constraints, - // so we won't change it. - continue; + // Make sure we don't try to migrate a secondary to our attached location: this case happens + // easily in environments without multiple AZs. + let exclude = match self.intent.attached { + Some(attached) => vec![attached], + None => vec![], }; - // Let the scheduler suggest a node, where it would put us if we were scheduling afresh - // This implicitly limits the choice to nodes that are available, and prefers nodes - // with lower utilization. 
- let Ok(candidate_node) = scheduler.schedule_shard::( - &self.intent.all_pageservers(), - &self.preferred_az_id, - schedule_context, - ) else { - // A scheduling error means we have no possible candidate replacements - continue; - }; - - let candidate_affinity_score = schedule_context - .nodes - .get(&candidate_node) - .unwrap_or(&AffinityScore::FREE); - - // The best alternative must be more than 1 better than us, otherwise we could end - // up flapping back next time we're called. - if *candidate_affinity_score + AffinityScore(1) < *affinity_score { - // If some other node is available and has a lower score than this node, then - // that other node is a good place to migrate to. - tracing::info!( - "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", - self.intent.get_secondary() - ); + let replacement = self.find_better_location::( + scheduler, + &schedule_context, + *secondary, + &exclude, + ); + assert!(replacement != Some(*secondary)); + if let Some(replacement) = replacement { + // We have found a candidate and confirmed that its score is preferable + // to this secondary's current location: emit an optimization that replaces + // the secondary with the candidate. 
return Some(ScheduleOptimization { sequence: self.sequence, action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { old_node_id: *secondary, - new_node_id: candidate_node, + new_node_id: replacement, }), }); } @@ -921,11 +1135,54 @@ impl TenantShard { self.intent.remove_secondary(scheduler, old_node_id); self.intent.push_secondary(scheduler, new_node_id); } + ScheduleOptimizationAction::CreateSecondary(new_node_id) => { + self.intent.push_secondary(scheduler, new_node_id); + } + ScheduleOptimizationAction::RemoveSecondary(old_secondary) => { + self.intent.remove_secondary(scheduler, old_secondary); + } } true } + /// When a shard has several secondary locations, we need to pick one in situations where + /// we promote one of them to an attached location: + /// - When draining a node for restart + /// - When responding to a node failure + /// + /// In this context, 'preferred' does not mean the node with the best scheduling score: instead + /// we want to pick the node which is best for use _temporarily_ while the previous attached location + /// is unavailable (e.g. because it's down or deploying). That means we prefer to use secondary + /// locations in a non-preferred AZ, as they're more likely to have a warm cache than a temporary + /// secondary in the preferred AZ (which are usually only created for migrations, and if they exist + /// they're probably not warmed up yet). TODO: this sentence was left unfinished ("The latter behavior is based on ..."). + /// + /// If this shard has no secondaries, or none of them are eligible for scheduling, return None: the + /// caller needs to pick a node some other way. + pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option { + let candidates = scheduler.filter_usable_nodes(&self.intent.secondary); + + // We will sort candidates to prefer nodes which are _not_ in our preferred AZ, i.e. 
we prefer + // to migrate to a long-lived secondary location (which would have been scheduled in a non-preferred AZ), + // rather than a short-lived secondary location being used for optimization/migration (which would have + // been scheduled in our preferred AZ). + let mut candidates = candidates + .iter() + .map(|(node_id, node_az)| { + if node_az == &self.intent.preferred_az_id { + (1, *node_id) + } else { + (0, *node_id) + } + }) + .collect::>(); + + candidates.sort(); + + candidates.first().map(|i| i.1) + } + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. @@ -1122,10 +1379,15 @@ impl TenantShard { let result = reconciler.reconcile().await; // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work + // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be + // sending a notification of a location that isn't really attached. if result.is_ok() && must_notify { // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] reconciler.compute_notify().await.ok(); + } else if must_notify { + // Carry this flag so that the reconciler's result will indicate that it still needs to retry + // the compute hook notification eventually. 
+ reconciler.compute_notify_failure = true; } // Update result counter @@ -1202,7 +1464,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), - preferred_az: self.preferred_az_id.clone(), + preferred_az: self.intent.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), @@ -1423,7 +1685,6 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1439,16 +1700,16 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), + preferred_az_id: self.intent.preferred_az_id.as_ref().map(|az| az.0.clone()), } } pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { - self.preferred_az_id.as_ref() + self.intent.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { - self.preferred_az_id = Some(preferred_az_id); + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: Option) { + self.intent.preferred_az_id = preferred_az_id; } /// Returns all the nodes to which this tenant shard is attached according to the @@ -1751,65 +2012,90 @@ pub(crate) mod tests { } #[test] - fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3, &[]); + /// Simple case: moving attachment to somewhere better where we already have a secondary + fn optimize_attachment_simple() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = 
make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); // Initially: both nodes attached on shard 1, and both have secondary locations // on different nodes. - shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(1)); shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); - shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); - let mut schedule_context = ScheduleContext::default(); - schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); - schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } - let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); - - // Either shard should recognize that it has the option to switch to a secondary location where there - // would be no other shards from the same tenant, and request to do so. 
+ let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a = shard_a.optimize_attachment(&mut scheduler, &schedule_context); assert_eq!( optimization_a, Some(ScheduleOptimization { sequence: shard_a.sequence, action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(2) + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) }) }) ); - - // Note that these optimizing two shards in the same tenant with the same ScheduleContext is - // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility - // of [`Service::optimize_all`] to avoid trying - // to do optimizations for multiple shards in the same tenant at the same time. Generating - // both optimizations is just done for test purposes - let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); - assert_eq!( - optimization_b, - Some(ScheduleOptimization { - sequence: shard_b.sequence, - action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { - old_attached_node_id: NodeId(1), - new_attached_node_id: NodeId(3) - }) - }) - ); - - // Applying these optimizations should result in the end state proposed shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); - assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); - assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); - shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); - assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); - assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + // // Either shard should recognize that it has the option to switch to a secondary location where there + // // would be no other shards from the same tenant, and request to do so. 
+ // assert_eq!( + // optimization_a_prepare, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::CreateSecondary(NodeId(2)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_migrate, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + // old_attached_node_id: NodeId(1), + // new_attached_node_id: NodeId(2) + // }) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + // assert_eq!( + // optimization_a_cleanup, + // Some(ScheduleOptimization { + // sequence: shard_a.sequence, + // action: ScheduleOptimizationAction::RemoveSecondary(NodeId(1)) + // }) + // ); + // shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); shard_a.intent.clear(&mut scheduler); shard_b.intent.clear(&mut scheduler); @@ -1817,6 +2103,190 @@ pub(crate) mod tests { Ok(()) } + #[test] + /// Complicated case: moving attachment to somewhere better where we do not have a secondary + /// already, creating one as needed. 
+ fn optimize_attachment_multistep() -> anyhow::Result<()> { + let nodes = make_test_nodes( + 3, + &[ + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Two shards of a tenant that wants to be in AZ A + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_a.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + shard_b.intent.preferred_az_id = Some(AvailabilityZone("az-a".to_string())); + + // Both shards are initially attached in non-home AZ _and_ have secondaries in non-home AZs + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(3))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(2)); + + fn make_schedule_context(shard_a: &TenantShard, shard_b: &TenantShard) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context + } + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_prepare = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_migrate = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: 
ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shard_a, &shard_b); + let optimization_a_cleanup = shard_a.optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3)) + }) + ); + shard_a.apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // // Shard B should not be moved anywhere, since the pressure on node 1 was relieved by moving shard A + // let schedule_context = make_schedule_context(&shard_a, &shard_b); + // assert_eq!(shard_b.optimize_attachment(&mut scheduler, &schedule_context), None); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + /// Check that multi-step migration works when moving to somewhere that is only better by + /// 1 AffinityScore -- this ensures that we don't have a bug like the intermediate secondary + /// counting toward the affinity score such that it prevents the rest of the migration from happening. 
+ fn optimize_attachment_marginal() -> anyhow::Result<()> { + let nodes = make_test_nodes(2, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Multi-sharded tenant, we will craft a situation where affinity + // scores differ only slightly + let mut shards = make_test_tenant(PlacementPolicy::Attached(0), ShardCount::new(4), None); + + // 1 attached on node 1 + shards[0] + .intent + .set_attached(&mut scheduler, Some(NodeId(1))); + // 3 attached on node 2 + shards[1] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[2] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + shards[3] + .intent + .set_attached(&mut scheduler, Some(NodeId(2))); + + // The scheduler should figure out that we need to: + // - Create a secondary for shard 3 on node 1 + // - Migrate shard 3 to node 1 + // - Remove shard 3's location on node 2 + + fn make_schedule_context(shards: &Vec) -> ScheduleContext { + let mut schedule_context = ScheduleContext::default(); + for shard in shards { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + schedule_context + } + + let schedule_context = make_schedule_context(&shards); + let optimization_a_prepare = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_prepare, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::CreateSecondary(NodeId(1)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_prepare.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_migrate = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_migrate, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(2), + new_attached_node_id: NodeId(1) + }) + }) + ); + shards[1].apply_optimization(&mut scheduler, 
optimization_a_migrate.unwrap()); + + let schedule_context = make_schedule_context(&shards); + let optimization_a_cleanup = + shards[1].optimize_attachment(&mut scheduler, &schedule_context); + assert_eq!( + optimization_a_cleanup, + Some(ScheduleOptimization { + sequence: shards[1].sequence, + action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2)) + }) + ); + shards[1].apply_optimization(&mut scheduler, optimization_a_cleanup.unwrap()); + + // Everything should be stable now + let schedule_context = make_schedule_context(&shards); + assert_eq!( + shards[0].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[1].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[2].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + assert_eq!( + shards[3].optimize_attachment(&mut scheduler, &schedule_context), + None + ); + + for mut shard in shards { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } + #[test] fn optimize_secondary() -> anyhow::Result<()> { let nodes = make_test_nodes(4, &[]); @@ -1834,9 +2304,7 @@ pub(crate) mod tests { let mut schedule_context = ScheduleContext::default(); schedule_context.avoid(&shard_a.intent.all_pageservers()); - schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); schedule_context.avoid(&shard_b.intent.all_pageservers()); - schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context); @@ -1867,7 +2335,6 @@ pub(crate) mod tests { // called repeatedly in the background. 
// Returns the applied optimizations fn optimize_til_idle( - nodes: &HashMap, scheduler: &mut Scheduler, shards: &mut [TenantShard], ) -> Vec { @@ -1879,14 +2346,18 @@ pub(crate) mod tests { for shard in shards.iter() { schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } } for shard in shards.iter_mut() { - let optimization = shard.optimize_attachment(nodes, &schedule_context); + let optimization = shard.optimize_attachment(scheduler, &schedule_context); + tracing::info!( + "optimize_attachment({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1894,7 +2365,15 @@ pub(crate) mod tests { } let optimization = shard.optimize_secondary(scheduler, &schedule_context); + tracing::info!( + "optimize_secondary({})={:?}", + shard.tenant_shard_id, + optimization + ); if let Some(optimization) = optimization { + // Check that maybe_optimizable wouldn't have wrongly claimed this optimization didn't exist + assert!(shard.maybe_optimizable(scheduler, &schedule_context)); + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; @@ -1918,14 +2397,34 @@ pub(crate) mod tests { /// that it converges. 
#[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4, &[]); + let nodes = make_test_nodes( + 9, + &[ + // Initial 6 nodes + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + AvailabilityZone("az-c".to_string()), + // Three we will add later + AvailabilityZone("az-a".to_string()), + AvailabilityZone("az-b".to_string()), + AvailabilityZone("az-c".to_string()), + ], + ); - // Only show the scheduler a couple of nodes + // Only show the scheduler two nodes in each AZ to start with let mut scheduler = Scheduler::new([].iter()); - scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + for i in 1..=6 { + scheduler.node_upsert(nodes.get(&NodeId(i)).unwrap()); + } - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); + let mut shards = make_test_tenant( + PlacementPolicy::Attached(1), + ShardCount::new(4), + Some(AvailabilityZone("az-a".to_string())), + ); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { assert!(shard @@ -1933,30 +2432,50 @@ pub(crate) mod tests { .is_ok()); } - // We should see equal number of locations on the two nodes. - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Initial: attached locations land in the tenant's home AZ. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2); - - assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - // Add another two nodes: we should see the shards spread out when their optimize - // methods are called - scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); - scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); - optimize_til_idle(&nodes, &mut scheduler, &mut shards); + // Initial: secondary locations in a remote AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); - assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + // Add another three nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(7)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(8)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(9)).unwrap()); + optimize_til_idle(&mut scheduler, &mut shards); + + // We expect one attached location was moved to the new node in the tenant's home AZ + assert_eq!(scheduler.get_node_shard_count(NodeId(7)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(7)), 1); + // The original node has one less attached shard + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + // One of the original nodes still has two 
attachments, since there are an odd number of nodes assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2); - assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); - - assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); - assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); + // None of our secondaries moved, since we already had enough nodes for those to be + // scheduled perfectly + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(5)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(5)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(6)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(6)), 0); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); @@ -1996,10 +2515,10 @@ pub(crate) mod tests { shard.schedule(&mut scheduler, context).unwrap(); } - let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a); + let applied_to_a = optimize_til_idle(&mut scheduler, &mut a); assert_eq!(applied_to_a, vec![]); - let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b); + let applied_to_b = optimize_til_idle(&mut scheduler, &mut b); assert_eq!(applied_to_b, vec![]); for shard in a.iter_mut().chain(b.iter_mut()) { diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c3..fa541bad17 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -131,7 +131,6 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] 
= ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index caa89955e3..76c3ad01a4 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import re +import threading from pathlib import Path import pytest @@ -188,7 +189,20 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): check_pgbench_output(out_path) - with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + stop_pump = threading.Event() + + def pump_controller(): + # Run a background loop to force the storage controller to run its + # background work faster than it otherwise would: this helps + # us: + # A) to create a test that runs in a shorter time + # B) to create a test that is more intensive by doing the shard migrations + # after splits happen more rapidly. 
+ while not stop_pump.is_set(): + env.storage_controller.reconcile_all() + stop_pump.wait(0.1) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count + 1) as pgbench_threads: pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) @@ -198,6 +212,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + pump_fut = pgbench_threads.submit(pump_controller) + pgbench_futs = [] for tenant_state in tenants.values(): fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) @@ -207,6 +223,9 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): for fut in pgbench_futs: fut.result() + stop_pump.set() + pump_fut.result() + def assert_all_split(): for tenant_id in tenants.keys(): shards = tenant_get_shards(env, tenant_id) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 49f41483ec..d45db28c78 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -13,11 +13,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: @@ -85,8 +87,12 @@ def test_storage_controller_many_tenants( ) AZS = ["alpha", "bravo", "charlie"] + + def az_selector(node_id): + return f"az-{AZS[(node_id - 1) % len(AZS)]}" + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( - {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + {"availability_zone": 
az_selector(ps_cfg["id"])} ) # A small sleep on each call into the notify hook, to simulate the latency of doing a database write @@ -168,6 +174,31 @@ def test_storage_controller_many_tenants( log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") assert rss < expect_memory_per_shard * total_shards + def assert_all_tenants_scheduled_in_home_az(): + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + preferred_az = None + for shard in desc["shards"]: + # All shards in a tenant should have the same preferred AZ + if preferred_az is None: + preferred_az = shard["preferred_az_id"] + else: + assert preferred_az == shard["preferred_az_id"] + + # Attachment should be in the preferred AZ + assert shard["preferred_az_id"] == az_selector( + shard["node_attached"] + ), f"Shard {shard['tenant_shard_id']} not in {shard['preferred_az_id']}" + + # Secondary locations should not be in the preferred AZ + for node_secondary in shard["node_secondary"]: + assert ( + shard["preferred_az_id"] != az_selector(node_secondary) + ), f"Shard {shard['tenant_shard_id']} secondary should be in {shard['preferred_az_id']}" + + # There should only be one secondary location (i.e. no migrations in flight) + assert len(shard["node_secondary"]) == 1 + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. 
api_concurrency = 135 @@ -242,6 +273,22 @@ def test_storage_controller_many_tenants( f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" ) + # Check initial scheduling + assert_all_tenants_scheduled_in_home_az() + az_attached_counts: defaultdict[str, int] = defaultdict(int) + az_secondary_counts: defaultdict[str, int] = defaultdict(int) + node_attached_counts: defaultdict[str, int] = defaultdict(int) + for tenant_id in tenants.keys(): + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + az_attached_counts[az_selector(shard["node_attached"])] += 1 + node_attached_counts[shard["node_attached"]] += 1 + for node_secondary in shard["node_secondary"]: + az_secondary_counts[az_selector(node_secondary)] += 1 + + log.info(f"Initial node attached counts: {node_attached_counts}") + log.info(f"Initial AZ shard counts: {az_attached_counts}, {az_secondary_counts}") + # Plan operations: ensure each tenant with a timeline gets at least # one of each operation type. Then add other tenants to make up the # numbers. @@ -450,11 +497,77 @@ def test_storage_controller_many_tenants( env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() + # Since we did `reconcile_until_idle` during the above loop, the system should be left in + # an optimally scheduled state. Validate that this includes all the tenants being scheduled + # in their home AZ. + assert_all_tenants_scheduled_in_home_az() + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. 
env.storage_controller.consistency_check() check_memory() + # Simulate loss of an AZ + victim_az = "az-alpha" + killed_pageservers = [] + for ps in env.pageservers: + if az_selector(ps.id) == victim_az: + ps.stop(immediate=True) + killed_pageservers.append(ps) + log.info(f"Killed pageserver {ps.id}") + + assert killed_pageservers + + # Wait for the controller to notice the pageservers are dead + def assert_pageservers_availability( + pageservers: list[NeonPageserver], expected_availability: PageserverAvailability + ): + nodes = env.storage_controller.nodes() + checked_any = False + node_ids = [ps.id for ps in pageservers] + for node in nodes: + if node["id"] in node_ids: + checked_any = True + assert ( + node["availability"] == expected_availability + ), f"Node {node['id']} is not {expected_availability} yet: {node['availability']}" + + assert checked_any + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.OFFLINE), + timeout=60, + ) + + # Let the controller finish all its rescheduling + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Check that all the tenants are rescheduled to the remaining pageservers + for tenant_id in tenant_ids: + desc = env.storage_controller.tenant_describe(tenant_id) + for shard in desc["shards"]: + # Attachment should be outside the AZ where we killed the pageservers + assert ( + az_selector(shard["node_attached"]) != victim_az + ), f"Shard {shard['tenant_shard_id']} still in {victim_az} (node {shard['node_attached']})" + + # Bring back the pageservers + for ps in killed_pageservers: + ps.start() + + wait_until( + lambda: assert_pageservers_availability(killed_pageservers, PageserverAvailability.ACTIVE), + timeout=60, + ) + + # A very long timeout is required: we will be migrating all the tenants on all the pageservers + # in the region that we just restored. 
Assume it'll take up to twice as long as it took to fill + # a single node + env.storage_controller.reconcile_until_idle( + max_interval=0.1, timeout_secs=DRAIN_FILL_TIMEOUT * 4 + ) + assert_all_tenants_scheduled_in_home_az() + # Stop the storage controller before tearing down fixtures, because it otherwise might log # errors trying to call our `ComputeReconfigure`. env.storage_controller.stop() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index ae48a8fc27..fe0422088a 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -84,9 +84,6 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_read_global_sum", - "pageserver_layers_visited_per_read_global_count", - "pageserver_layers_visited_per_read_global_bucket", "pageserver_layers_visited_per_vectored_read_global_sum", "pageserver_layers_visited_per_vectored_read_global_count", "pageserver_layers_visited_per_vectored_read_global_bucket", @@ -97,12 +94,6 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") - non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") - if non_vectored_count.value != 0: - non_vectored_average = non_vectored_sum.value / non_vectored_count.value - else: - non_vectored_average = 0 vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") if vectored_count.value > 0: @@ -113,11 +104,10 @@ page_cache_size=10 assert vectored_sum.value == 0 vectored_average = 0 - log.info(f"{non_vectored_average=} {vectored_average=}") + log.info(f"{vectored_average=}") # The upper bound for average number of layer visits below (8) # was 
chosen empirically for this workload. - assert non_vectored_average < 8 assert vectored_average < 8 diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 71963355b7..5dcc93acff 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -219,7 +219,7 @@ if SQL_EXPORTER is None: # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. - super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port @@ -252,7 +252,7 @@ if SQL_EXPORTER is None: log.info("Waiting for sql_exporter to be ready") wait_for_logs( self, - rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + rf'msg="Listening on" address=\[::\]:{self.__port}', timeout=5, ) @@ -344,10 +344,7 @@ else: time.sleep(0.5) continue - if ( - f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' - in line - ): + if f'msg="Listening on" address=[::]:{self._sql_exporter_port}' in line: break @override diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 6cb11b825d..17819fd367 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -187,7 +187,7 @@ def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_en origin=primary, endpoint_id="secondary", config_lines=[ - "max_connections=2", + "max_connections=5", "autovacuum_max_workers=1", "max_worker_processes=5", "max_wal_senders=1", diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index ea01252ce4..f14317a39f 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -1,10 +1,15 @@ 
from __future__ import annotations +import asyncio import ssl +import asyncpg import pytest +import websocket_tunnel import websockets +from fixtures.log_helper import log from fixtures.neon_fixtures import NeonProxy +from fixtures.port_distributor import PortDistributor @pytest.mark.asyncio @@ -196,3 +201,53 @@ async def test_websockets_pipelined(static_proxy: NeonProxy): # close await websocket.send(b"X\x00\x00\x00\x04") await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_tunneled(static_proxy: NeonProxy, port_distributor: PortDistributor): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + # Launch a tunnel service so that we can speak the websockets protocol to + # the proxy + tunnel_port = port_distributor.get_port() + tunnel_server = await websocket_tunnel.start_server( + "127.0.0.1", + tunnel_port, + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl_context, + ) + log.info(f"websockets tunnel listening for connections on port {tunnel_port}") + + async with tunnel_server: + + async def run_tunnel(): + try: + async with tunnel_server: + await tunnel_server.serve_forever() + except Exception as e: + log.error(f"Error in tunnel task: {e}") + + tunnel_task = asyncio.create_task(run_tunnel()) + + # Ok, the tunnel is now running. 
Check that we can connect to the proxy's + # websocket interface, through the tunnel + tunnel_connstring = f"postgres://{user}:{password}@127.0.0.1:{tunnel_port}/postgres" + + log.info(f"connecting to {tunnel_connstring}") + conn = await asyncpg.connect(tunnel_connstring) + res = await conn.fetchval("SELECT 123") + assert res == 123 + await conn.close() + log.info("Ran a query successfully through the tunnel") + + tunnel_server.close() + try: + await tunnel_task + except asyncio.CancelledError: + pass diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 673904a1cd..86a6b7428b 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -520,14 +520,18 @@ def test_sharding_split_smoke( shard_count = 2 # Shard count we split into split_shard_count = 4 - # We will have 2 shards per pageserver once done (including secondaries) - neon_env_builder.num_pageservers = split_shard_count + # In preferred AZ & other AZ we will end up with one shard per pageserver + neon_env_builder.num_pageservers = split_shard_count * 2 # Two AZs def assign_az(ps_cfg): az = f"az-{(ps_cfg['id'] - 1) % 2}" ps_cfg["availability_zone"] = az + # We will run more pageservers than tests usually do, so give them tiny page caches + # in case we're on a test node under memory pressure. + ps_cfg["page_cache_size"] = 128 + neon_env_builder.pageserver_config_override = assign_az # 1MiB stripes: enable getting some meaningful data distribution without @@ -679,8 +683,8 @@ def test_sharding_split_smoke( # - shard_count reconciles for the original setup of the tenant # - shard_count reconciles for detaching the original secondary locations during split # - split_shard_count reconciles during shard splitting, for setting up secondaries. 
- # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move) - expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2 + # - split_shard_count/2 reconciles to migrate shards to their temporary secondaries + expect_reconciles = shard_count * 2 + split_shard_count + 3 * (split_shard_count / 2) reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -745,10 +749,14 @@ def test_sharding_split_smoke( # dominated by shard count. log.info(f"total: {total}") assert total == { - 1: 2, - 2: 2, - 3: 2, - 4: 2, + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, } # The controller is not required to lay out the attached locations in any particular way, but @@ -1387,13 +1395,7 @@ def test_sharding_split_failures( else: attached_count += 1 - if exclude_ps_id is not None: - # For a node failure case, we expect there to be a secondary location - # scheduled on the offline node, so expect one fewer secondary in total - assert secondary_count == initial_shard_count - 1 - else: - assert secondary_count == initial_shard_count - + assert secondary_count == initial_shard_count assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id: int | None = None) -> None: diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index da6d5b8622..8ffb6ba6b2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("postgres behavior is not relevant") +def test_storage_controller_compute_hook_retry( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address: ListenAddress, +): + """ + Test that when a 
reconciler can't do its compute hook notification, it will keep + trying until it succeeds. + + Reproducer for https://github.com/neondatabase/cloud/issues/22612 + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.create_tenant(tenant_id, placement_policy='{"Attached": 1}') + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: dict[str, list[dict[str, int]] | str | None | int] = { + "tenant_id": str(tenant_id), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, + } + assert notifications[0] == expect + + # Block notifications, and fail a node + handle_params["status"] = 423 + env.pageservers[0].stop() + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Avoid waiting for heartbeats + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later, + # and we will check that that happens + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ 
+ "is_pending_compute_notification" + ] + is True + ) + + # Try reconciling again, it should try notifying again + notifications = [] + try: + assert env.storage_controller.reconcile_all() == 1 + except StorageControllerApiException as e: + assert "Control plane tenant busy" in str(e) + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # The describe API should indicate that a notification is pending + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is True + ) + + # Unblock notifications: reconcile should work now + handle_params["status"] = 200 + notifications = [] + assert env.storage_controller.reconcile_all() == 1 + assert len(notifications) == 1 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + # Reconciler should be idle now that it succeeded in its compute notification + notifications = [] + assert env.storage_controller.reconcile_all() == 0 + assert len(notifications) == 0 + assert ( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "is_pending_compute_notification" + ] + is False + ) + + @run_only_on_default_postgres("this test doesn't start an endpoint") def test_storage_controller_compute_hook_revert( httpserver: HTTPServer, @@ -936,7 +1052,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): that just hits the endpoints to check that they don't bitrot. 
""" - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_start() tenant_id = TenantId.generate() @@ -961,7 +1077,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): "GET", f"{env.storage_controller_api}/debug/v1/scheduler" ) # Two nodes, in a dict of node_id->node - assert len(response.json()["nodes"]) == 2 + assert len(response.json()["nodes"]) == 3 assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) @@ -972,13 +1088,25 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) + # Secondary migration API: superficial check that it migrates + secondary_dest = env.pageservers[2].id + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0002/migrate_secondary", + headers=env.storage_controller.headers(TokenScope.ADMIN), + json={"tenant_shard_id": f"{tenant_id}-0002", "node_id": secondary_dest}, + ) + assert env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_secondary"] == [ + secondary_dest + ] + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", headers=env.storage_controller.headers(TokenScope.ADMIN), ) - assert len(env.storage_controller.node_list()) == 1 + assert len(env.storage_controller.node_list()) == 2 # Tenant unclean drop API response = env.storage_controller.request( @@ -1696,7 +1824,13 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): """ output_dir = neon_env_builder.test_output_dir shard_count = 4 - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + 
env.create_tenant(tenant_id, placement_policy='{"Attached":1}', shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] def storcon_cli(args): @@ -1725,7 +1859,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # List nodes node_lines = storcon_cli(["nodes"]) # Table header, footer, and one line of data - assert len(node_lines) == 5 + assert len(node_lines) == 7 assert "localhost" in node_lines[3] # Pause scheduling onto a node @@ -1743,10 +1877,21 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) assert "Offline" in storcon_cli(["nodes"])[3] + # Restore node, verify status changes in CLI output + env.pageservers[0].start() + + def is_online(): + assert "Offline" not in storcon_cli(["nodes"]) + + wait_until(is_online) + + # Let everything stabilize after node failure to avoid interfering with subsequent steps + env.storage_controller.reconcile_until_idle(timeout_secs=10) + # List tenants tenant_lines = storcon_cli(["tenants"]) assert len(tenant_lines) == 5 - assert str(env.initial_tenant) in tenant_lines[3] + assert str(tenant_id) in tenant_lines[3] # Setting scheduling policies intentionally result in warnings, they're for rare use. 
env.storage_controller.allowed_errors.extend( @@ -1754,23 +1899,58 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): ) # Describe a tenant - tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(tenant_id)]) assert len(tenant_lines) >= 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[0] + assert str(tenant_id) in tenant_lines[0] + + # Migrate an attached location + def other_ps_id(current_ps_id): + return ( + env.pageservers[0].id + if current_ps_id == env.pageservers[1].id + else env.pageservers[1].id + ) + + storcon_cli( + [ + "tenant-shard-migrate", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0]["node_attached"] + ) + ), + ] + ) + + # Migrate a secondary location + storcon_cli( + [ + "tenant-shard-migrate-secondary", + "--tenant-shard-id", + f"{tenant_id}-0004", + "--node", + str( + other_ps_id( + env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] + ) + ), + ] + ) # Pause changes on a tenant - storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] # Cancel ongoing reconcile on a tenant - storcon_cli( - ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] - ) + storcon_cli(["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{tenant_id}-0104"]) # Change a tenant's placement - storcon_cli( - ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] - ) + storcon_cli(["tenant-policy", "--tenant-id", str(tenant_id), "--placement", "secondary"]) assert "Secondary" in storcon_cli(["tenants"])[3] # Modify a tenant's config @@ -1778,7 +1958,7 @@ def 
test_storcon_cli(neon_env_builder: NeonEnvBuilder): [ "patch-tenant-config", "--tenant-id", - str(env.initial_tenant), + str(tenant_id), "--config", json.dumps({"pitr_interval": "1m"}), ] @@ -3033,11 +3213,12 @@ def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): def assign_az(ps_cfg): - az = f"az-{ps_cfg['id']}" + az = f"az-{ps_cfg['id'] % 2}" + log.info("Assigned AZ {az}") ps_cfg["availability_zone"] = az neon_env_builder.pageserver_config_override = assign_az - neon_env_builder.num_pageservers = 2 + neon_env_builder.num_pageservers = 4 env = neon_env_builder.init_configs() env.start() @@ -3052,8 +3233,14 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): assert shards[0]["preferred_az_id"] == expected_az + # When all other schedule scoring parameters are equal, tenants should round-robin on AZs + assert env.storage_controller.tenant_describe(tids[0])["shards"][0]["preferred_az_id"] == "az-0" + assert env.storage_controller.tenant_describe(tids[1])["shards"][0]["preferred_az_id"] == "az-1" + assert env.storage_controller.tenant_describe(tids[2])["shards"][0]["preferred_az_id"] == "az-0" + + # Try modifying preferred AZ updated = env.storage_controller.set_preferred_azs( - {TenantShardId(tid, 0, 0): "foo" for tid in tids} + {TenantShardId(tid, 0, 0): "az-0" for tid in tids} ) assert set(updated) == set([TenantShardId(tid, 0, 0) for tid in tids]) @@ -3061,29 +3248,24 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for tid in tids: shards = env.storage_controller.tenant_describe(tid)["shards"] assert len(shards) == 1 - assert shards[0]["preferred_az_id"] == "foo" + assert shards[0]["preferred_az_id"] == "az-0" - # Generate a layer to avoid shard split handling on ps from tripping - # up on debug assert. 
- timeline_id = TimelineId.generate() - env.create_timeline("bar", tids[0], timeline_id) - - workload = Workload(env, tids[0], timeline_id, branch_name="bar") - workload.init() - workload.write_rows(256) - workload.validate() + # Having modified preferred AZ, we should get moved there + env.storage_controller.reconcile_until_idle(max_interval=0.1) + for tid in tids: + shard = env.storage_controller.tenant_describe(tid)["shards"][0] + attached_to = shard["node_attached"] + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" env.storage_controller.tenant_shard_split(tids[0], shard_count=2) + env.storage_controller.reconcile_until_idle(max_interval=0.1) shards = env.storage_controller.tenant_describe(tids[0])["shards"] assert len(shards) == 2 for shard in shards: attached_to = shard["node_attached"] - expected_az = env.get_pageserver(attached_to).az_id - - # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed - # in putting the tenant shards in the preferred AZ. - # To be fixed in https://github.com/neondatabase/neon/pull/9916 - # assert shard["preferred_az_id"] == expected_az + attached_in_az = env.get_pageserver(attached_to).az_id + assert shard["preferred_az_id"] == attached_in_az == "az-0" @run_only_on_default_postgres("Postgres version makes no difference here") diff --git a/test_runner/websocket_tunnel.py b/test_runner/websocket_tunnel.py new file mode 100755 index 0000000000..facdb19140 --- /dev/null +++ b/test_runner/websocket_tunnel.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# +# This program helps to test the WebSocket tunneling in proxy. It listens for a TCP +# connection on a port, and when you connect to it, it opens a websocket connection, +# and forwards all the traffic to the websocket connection, wrapped in WebSocket binary +# frames. +# +# This is used in the test_proxy::test_websockets test, but it is handy for manual testing too. 
+# +# Usage for manual testing: +# +# ## Launch Posgres on port 3000: +# postgres -D data -p3000 +# +# ## Launch proxy with WSS enabled: +# openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj '/CN=*.neon.localtest.me' +# ./target/debug/proxy --wss 127.0.0.1:40433 --http 127.0.0.1:28080 --mgmt 127.0.0.1:9099 --proxy 127.0.0.1:4433 --tls-key server.key --tls-cert server.crt --auth-backend postgres +# +# ## Launch the tunnel: +# +# poetry run ./test_runner/websocket_tunnel.py --ws-port 40433 --ws-url "wss://ep-test.neon.localtest.me" +# +# ## Now you can connect with psql: +# psql "postgresql://heikki@localhost:40433/postgres" +# + +import argparse +import asyncio +import logging +import ssl +from ssl import Purpose + +import websockets +from fixtures.log_helper import log + + +# Enable verbose logging of all the traffic +def enable_verbose_logging(): + logger = logging.getLogger("websockets") + logger.setLevel(logging.DEBUG) + logger.addHandler(logging.StreamHandler()) + + +async def start_server(tcp_listen_host, tcp_listen_port, ws_url, ctx): + server = await asyncio.start_server( + lambda r, w: handle_client(r, w, ws_url, ctx), tcp_listen_host, tcp_listen_port + ) + return server + + +async def handle_tcp_to_websocket(tcp_reader, ws): + try: + while not tcp_reader.at_eof(): + data = await tcp_reader.read(1024) + + await ws.send(data) + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def handle_websocket_to_tcp(ws, tcp_writer): + try: + async for message in ws: + tcp_writer.write(message) + await tcp_writer.drain() + except websockets.exceptions.ConnectionClosedError as e: + log.debug(f"connection closed: {e}") + except websockets.exceptions.ConnectionClosedOK: + log.debug("connection closed") + except Exception as e: + log.error(e) + + +async def 
handle_client(tcp_reader, tcp_writer, ws_url: str, ctx: ssl.SSLContext): + try: + log.info("Received TCP connection. Connecting to websockets proxy.") + + async with websockets.connect(ws_url, ssl=ctx) as ws: + try: + log.info("Connected to websockets proxy") + + async with asyncio.TaskGroup() as tg: + task1 = tg.create_task(handle_tcp_to_websocket(tcp_reader, ws)) + task2 = tg.create_task(handle_websocket_to_tcp(ws, tcp_writer)) + + done, pending = await asyncio.wait( + [task1, task2], return_when=asyncio.FIRST_COMPLETED + ) + tcp_writer.close() + await ws.close() + + except* Exception as ex: + log.error(ex.exceptions) + except Exception as e: + log.error(e) + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--tcp-listen-addr", + default="localhost", + help="TCP addr to listen on", + ) + parser.add_argument( + "--tcp-listen-port", + default="40444", + help="TCP port to listen on", + ) + + parser.add_argument( + "--ws-url", + default="wss://localhost/", + help="websocket URL to connect to. 
This determines the Host header sent to the server", + ) + parser.add_argument( + "--ws-host", + default="127.0.0.1", + help="websockets host to connect to", + ) + parser.add_argument( + "--ws-port", + type=int, + default=443, + help="websockets port to connect to", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + if args.verbose: + enable_verbose_logging() + + ctx = ssl.create_default_context(Purpose.SERVER_AUTH) + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + server = await start_server(args.tcp_listen_addr, args.tcp_listen_port, args.ws_url, ctx) + print( + f"Listening for connections at {args.tcp_listen_addr}:{args.tcp_listen_port}, forwarding them to {args.ws_host}:{args.ws_port}" + ) + async with server: + await server.serve_forever() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c2f65b3201..210a0ba3af 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 +Subproject commit 210a0ba3afd8134ea910b203f274b165bd4f05d7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f262d631ad..d3141e17a7 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 +Subproject commit d3141e17a7155e3d07c8deba4a10c748a29ba1e6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 97f9fde349..f63b141cfb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 +Subproject commit f63b141cfb0c813725a6b2574049565bff643018 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 7e3f3974bc..0f8da73ed0 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd +Subproject commit 
0f8da73ed08d4fc4ee58cccea008c75bfb20baa8 diff --git a/vendor/revisions.json b/vendor/revisions.json index bff2f70931..b4d57ab709 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "7e3f3974bc8895938308f94d0e96879ffae638cd" + "0f8da73ed08d4fc4ee58cccea008c75bfb20baa8" ], "v16": [ "16.6", - "97f9fde349c6de6d573f5ce96db07eca60ce6185" + "f63b141cfb0c813725a6b2574049565bff643018" ], "v15": [ "15.10", - "f262d631ad477a1819e84a183e5a7ef561830085" + "d3141e17a7155e3d07c8deba4a10c748a29ba1e6" ], "v14": [ "14.15", - "c2f65b3201591e02ce45b66731392f98d3388e73" + "210a0ba3afd8134ea910b203f274b165bd4f05d7" ] }