diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e18e6a1201..67152b6991 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -62,7 +62,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -83,6 +83,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -136,6 +140,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -148,6 +159,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: mold -run make postgres-v16 -j$(nproc) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: mold -run make postgres-v17 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -210,7 +225,7 @@ jobs: run: | PQ_LIB_DIR=$(pwd)/pg_install/v16/lib export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH #nextest does not yet support running doctests diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4bb9e5cb66..7c06fd9ab8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -211,7 +211,7 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -548,7 +548,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, 
v15, v16 ] + version: [ v14, v15, v16, v17 ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -627,7 +627,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v16' + if: matrix.version == 'v17' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -649,7 +649,7 @@ jobs: strategy: matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] steps: - uses: docker/login-action@v3 @@ -671,7 +671,7 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ @@ -689,7 +689,7 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -700,7 +700,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] env: VM_BUILDER_VERSION: v0.29.3 @@ -798,7 +798,7 @@ jobs: runs-on: ubuntu-22.04 env: - VERSIONS: v14 v15 v16 + VERSIONS: v14 v15 v16 v17 steps: - uses: docker/login-action@v3 @@ -839,7 +839,7 @@ jobs: done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Login to prod ECR uses: 
docker/login-action@v3 @@ -852,7 +852,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -864,7 +864,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -876,7 +876,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -971,7 +971,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git 
config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -1117,6 +1117,7 @@ jobs: files_to_promote+=("s3://${BUCKET}/${s3_key}") + # TODO Add v17 for pg_version in v14 v15 v16; do # We run less tests for debug builds, so we don't need to promote them if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7fecdbde8c..41c9f5dee5 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -72,6 +72,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 @@ -93,6 +97,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -120,6 +131,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: make postgres-v16 -j$(sysctl -n hw.ncpu) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: make postgres-v17 -j$(sysctl -n hw.ncpu) + - name: Build neon extensions run: make neon-pg-ext -j$(sysctl -n hw.ncpu) diff --git a/.gitmodules b/.gitmodules index 1d925674a1..d1330bf28c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,7 @@ path = vendor/postgres-v16 url = 
https://github.com/neondatabase/postgres.git branch = REL_16_STABLE_neon +[submodule "vendor/postgres-v17"] + path = vendor/postgres-v17 + url = https://github.com/neondatabase/postgres.git + branch = REL_17_STABLE_neon diff --git a/Dockerfile b/Dockerfile index 1efedfa9bc..bdb76a4f4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned +ARG DEFAULT_PG_VERSION=17 +ARG STABLE_PG_VERSION=16 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -13,6 +15,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 +COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG +ARG STABLE_PG_VERSION COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib +COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib COPY --chown=nonroot . . 
ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -52,6 +58,7 @@ RUN set -e \ # Build final image # FROM debian:bullseye-slim +ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ @@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ +COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -93,7 +101,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH=/usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib VOLUME ["/data"] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b6c89cd71f..fe902eb978 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -79,15 +79,23 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /sfcgal && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -96,7 +104,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -122,7 +133,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis -RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -142,12 +156,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti # ######################################################################################### FROM build-deps AS plv8-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ # generate and copy upgrade scripts @@ -172,9 +193,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t # ######################################################################################### FROM build-deps AS h3-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "$(uname -m)" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "$(uname -m)" in \ "x86_64") \ export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ ;; \ @@ -192,7 +217,11 @@ RUN case "$(uname -m)" in \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /h3/usr/ && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -202,7 +231,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ @@ -218,9 +250,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # ######################################################################################### FROM build-deps AS unit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -239,6 +275,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - # ######################################################################################### FROM build-deps AS vector-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch @@ -246,7 +283,10 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ @@ -261,10 +301,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O # ######################################################################################### FROM build-deps AS pgjwt-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 -RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -277,9 +321,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 # ######################################################################################### FROM build-deps AS hypopg-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -293,9 +341,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo # ######################################################################################### FROM build-deps AS pg-hashids-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -309,11 +361,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # ######################################################################################### FROM build-deps AS rum-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/rum.patch /rum.patch -RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ patch -p1 < /rum.patch && \ @@ -328,9 +384,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r # ######################################################################################### FROM build-deps AS pgtap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -344,9 +404,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta # ######################################################################################### FROM build-deps AS ip4r-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -360,9 +424,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # ######################################################################################### FROM build-deps AS prefix-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -376,9 +444,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # ######################################################################################### FROM build-deps AS hll-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -392,9 +464,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # ######################################################################################### FROM build-deps AS plpgsql-check-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -413,7 +489,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -446,7 +525,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -459,6 +541,9 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ + "v17") \ + echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -478,10 +563,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-cron-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -495,9 +584,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O # ######################################################################################### FROM build-deps AS rdkit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y \ cmake \ libboost-iostreams1.74-dev \ @@ -507,7 +600,10 @@ RUN apt-get update && \ libeigen3-dev ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ @@ -544,10 +640,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. # ######################################################################################### FROM build-deps AS pg-uuidv7-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -561,10 +661,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz # ######################################################################################### FROM build-deps AS pg-roaringbitmap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -578,10 +682,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # ######################################################################################### FROM build-deps AS pg-semver-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -599,7 +707,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -620,10 +731,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-anon-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -641,6 +756,7 @@ RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag # ######################################################################################### FROM build-deps AS rust-extensions-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ @@ -651,9 +767,11 @@ ENV HOME=/home/nonroot ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -672,7 +790,10 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 @@ -694,7 +815,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar. 
FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -714,7 +838,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ # TODO update pgrx version in the pg_tiktoken repo and remove this line @@ -733,7 +860,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -748,10 +878,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - ######################################################################################### FROM build-deps AS wal2json-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -764,10 +898,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. # ######################################################################################### FROM build-deps AS pg-ivm-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -781,10 +919,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv # ######################################################################################### FROM build-deps AS pg-partman-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -835,7 +977,10 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ @@ -854,8 +999,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ case "${PG_VERSION}" in \ "v14" | "v15") \ ;; \ - "v16") \ - echo "Skipping HNSW for PostgreSQL 16" && exit 0 \ + "v16" | "v17") \ + echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ @@ -878,7 +1023,10 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto ######################################################################################### # @@ -899,15 +1047,24 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + rm -r /usr/local/pgsql/include # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + rm /usr/local/pgsql/lib/lib*.a ######################################################################################### @@ -918,7 +1075,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN mkdir /ext-src +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr @@ -956,18 +1116,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src COPY patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ + esac && \ + cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch # cmake is required for the h3 test -RUN apt-get update && apt-get install -y cmake -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && apt-get install -y cmake +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 &str { "14" => return "v14", "15" => return "v15", "16" => return "v16", + "17" => return "v17", _ => {} }, _ => {} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5dbc3bcbbc..d616154af6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -342,7 +342,7 @@ impl LocalEnv { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c715d6b789..2b714fbfbf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -28,6 +28,7 @@ use utils::{ auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId}, }; +use whoami::username; pub struct StorageController { env: LocalEnv, @@ -183,7 +184,7 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. 
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; for v in prefer_versions { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); @@ -211,7 +212,16 @@ impl StorageController { /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; + let args = [ + "-h", + "localhost", + "-U", + &username(), + "-d", + DB_NAME, + "-p", + &format!("{}", postgres_port), + ]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -225,7 +235,11 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!( + "postgresql://{}@localhost:{}/{DB_NAME}", + &username(), + postgres_port + ); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -235,6 +249,10 @@ impl StorageController { "localhost", "-p", &format!("{}", postgres_port), + "-U", + &username(), + "-O", + &username(), DB_NAME, ]) .output() @@ -271,7 +289,7 @@ impl StorageController { // But tokio-postgres fork doesn't have this upstream commit: // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 - .user(&whoami::username()) + .user(&username()) .dbname(DB_NAME) .connect(tokio_postgres::NoTls) .await @@ -328,6 +346,12 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? 
{ + let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + tracing::info!( + "Initializing storage controller database with args: {:?}", + initdb_args + ); + // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) @@ -335,7 +359,7 @@ impl StorageController { ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]) - .args(["-D", pg_data_path.as_ref()]) + .args(initdb_args) .spawn() .expect("Failed to spawn initdb"); let status = child.wait().await?; @@ -364,8 +388,14 @@ impl StorageController { pg_data_path.as_ref(), "-l", pg_log_path.as_ref(), + "-U", + &username(), "start", ]; + tracing::info!( + "Starting storage controller database with args: {:?}", + db_start_args + ); background_process::start_process( "storage_controller_db", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8929ccb41d..4a776709c9 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,8 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::Oid; use postgres_ffi::RepOriginId; -use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -350,7 +350,17 @@ impl Key { // 02 00000000 00000000 00000000 00 00000000 // // TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID +// +// 02 00000000 00000000 00XXXXXX XX XXXXXXXX +// +// \______XID_________/ +// +// The 64-bit XID is stored a little awkwardly in field6, field5 and +// field4. PostgreSQL v16 and below only stored a 32-bit XID, which +// fit completely in field6, but starting with PostgreSQL v17, a full +// 64-bit XID is used. Most pageserver code that accesses +// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits +// are just unused. 
// // ControlFile: // 03 00000000 00000000 00000000 00 00000000 @@ -582,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key { }; #[inline(always)] -pub fn twophase_file_key(xid: TransactionId) -> Key { +pub fn twophase_file_key(xid: u64) -> Key { Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, } } #[inline(always)] -pub fn twophase_key_range(xid: TransactionId) -> Range { +pub fn twophase_key_range(xid: u64) -> Range { + // 64-bit XIDs really should not overflow let (next_xid, overflowed) = xid.overflowing_add(1); Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, }..Key { field1: 0x02, field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, + field3: u32::from(overflowed), + field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((next_xid & 0x000000FF00000000) >> 32) as u8, + field6: (next_xid & 0x00000000FFFFFFFF) as u32, } } diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index a346390f3d..d3a85f2683 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15", "v16"] { + for pg_version in &["v14", "v15", "v16", "v17"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f18e0c603b..0d46ed6aac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -57,6 +57,7 @@ macro_rules! 
for_all_postgres_versions { $macro!(v14); $macro!(v15); $macro!(v16); + $macro!(v17); }; } @@ -91,6 +92,7 @@ macro_rules! dispatch_pgversion { 14 : v14, 15 : v15, 16 : v16, + 17 : v17, ] ) }; @@ -121,6 +123,7 @@ macro_rules! enum_pgversion_dispatch { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] ) }; @@ -150,6 +153,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; @@ -162,6 +166,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6ce855c78e..61b49a634d 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From heapam_xlog.h +pub const XLOG_HEAP2_REWRITE: u8 = 0x00; + // From replication/message.h pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; @@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; +/* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; -pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; -pub const XLP_LONG_HEADER: u16 = 0x0002; +pub const XLOG_PARAMETER_CHANGE: u8 = 0x60; +pub const XLOG_END_OF_RECOVERY: u8 = 0x90; /* From xlog.h */ pub const XLOG_REPLORIGIN_SET: u8 = 0x00; pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; +/* xlog_internal.h */ +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLP_LONG_HEADER: u16 = 0x0002; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 = /* From 
origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; -// List of subdirectories inside pgdata. -// Copied from src/bin/initdb/initdb.c -pub const PGDATA_SUBDIRS: [&str; 22] = [ - "global", - "pg_wal/archive_status", - "pg_commit_ts", - "pg_dynshmem", - "pg_notify", - "pg_serial", - "pg_snapshots", - "pg_subtrans", - "pg_twophase", - "pg_multixact", - "pg_multixact/members", - "pg_multixact/offsets", - "base", - "base/1", - "pg_replslot", - "pg_tblspc", - "pg_stat", - "pg_stat_tmp", - "pg_xact", - "pg_logical", - "pg_logical/snapshots", - "pg_logical/mappings", -]; - // Don't include postgresql.conf as it is inconvenient on node start: // we need postgresql.conf before basebackup to synchronize safekeepers // so no point in overwriting it during backup restore. Rest of the files diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 32f8f51114..fe01a5df7c 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +// List of subdirectories inside pgdata. 
+// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 22] = [ + "global", + "pg_wal/archive_status", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 } diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 626a23c7ea..3cd1b7aec5 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 587be71cb3..31bd5b68fd 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs new file mode 100644 index 0000000000..2132938680 --- /dev/null +++ 
b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -0,0 +1,55 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +// List of subdirectories inside pgdata. +// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 23] = [ + "global", + "pg_wal/archive_status", + "pg_wal/summaries", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} + + +pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10; +pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20; +pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30; + + +pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0; +pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6052f04d11..949e3f4251 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -53,7 +53,7 @@ impl Conf { #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 | 15 | 16 
=> Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 28547f52bf..3f549889b8 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -5,6 +5,8 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; +const WALPROPOSER_PG_VERSION: &str = "v17"; + fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -36,7 +38,10 @@ fn main() -> anyhow::Result<()> { // Rebuild crate when libwalproposer.a changes println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); - let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); + let pg_config_bin = pg_install_abs + .join(WALPROPOSER_PG_VERSION) + .join("bin") + .join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") @@ -53,7 +58,7 @@ fn main() -> anyhow::Result<()> { .into() } else { let server_path = pg_install_abs - .join("v16") + .join(WALPROPOSER_PG_VERSION) .join("include") .join("postgresql") .join("server") diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index a07107753e..adc090823d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option { return None; } let keys: Vec<&str> = split[0].split('-').collect(); - let mut lsns: Vec<&str> = split[1].split('-').collect(); - let is_delta = if lsns.len() == 1 { - lsns.push(lsns[0]); + let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); + let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); + let 
the_lsns: [&str; 2]; + + /* + * Generations add a -vX-XXXXXX postfix, which causes issues when we try to + * parse 'vX' as an LSN. + */ + let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { + the_lsns = [lsns[0], lsns[0]]; false } else { + the_lsns = [lsns[0], lsns[1]]; true }; let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); let holes = Vec::new(); Some(LayerFile { key_range, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 207f781e1b..a32d09f3b3 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; @@ -255,8 +254,11 @@ where let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + let pgversion = self.timeline.pg_version; + let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]); + // Create pgdata subdirs structure - for dir in PGDATA_SUBDIRS.iter() { + for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar .append(&header, &mut io::empty()) @@ -606,7 +608,7 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { + async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> { let img = self 
.timeline .get_twophase_file(xid, self.lsn, self.ctx) @@ -617,7 +619,11 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = format!("pg_twophase/{:>08X}", xid); + let path = if self.timeline.pg_version < 17 { + format!("pg_twophase/{:>08X}", xid) + } else { + format!("pg_twophase/{:>016X}", xid) + }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar .append(&header, &buf[..]) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 29a98855d3..e9f197ec2d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -281,7 +281,7 @@ impl PageServerConf { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 5a0894cd1b..ca87f1d080 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -580,9 +580,11 @@ async fn import_file( import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = read_all_bytes(reader).await?; + + // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions, + // it's a 32-bit TransactionId, which fits in u64 anyway. 
+ let xid = u64::from_str_radix(file_name.as_ref(), 16)?; modification .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6dd8851b13..5f8766ca2c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -633,7 +633,7 @@ impl Timeline { pub(crate) async fn get_twophase_file( &self, - xid: TransactionId, + xid: u64, lsn: Lsn, ctx: &RequestContext, ) -> Result { @@ -646,11 +646,19 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result, PageReconstructError> { + ) -> Result, PageReconstructError> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - Ok(TwoPhaseDirectory::des(&buf)?.xids) + if self.pg_version >= 17 { + Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) + } else { + Ok(TwoPhaseDirectory::des(&buf)? + .xids + .iter() + .map(|x| u64::from(*x)) + .collect()) + } } pub(crate) async fn get_control_file( @@ -902,9 +910,13 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; - let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + + let mut xids: Vec = self + .list_twophase_files(lsn, ctx) + .await? 
+ .iter() + .cloned() + .collect(); xids.sort_unstable(); for xid in xids { result.add_key(twophase_file_key(xid)); @@ -1127,9 +1139,15 @@ impl<'a> DatadirModification<'a> { // Create AuxFilesDirectory self.init_aux_dir()?; - let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { - xids: HashSet::new(), - })?; + let buf = if self.tline.pg_version >= 17 { + TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { + xids: HashSet::new(), + }) + } else { + TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + }) + }?; self.pending_directory_entries .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); @@ -1321,22 +1339,31 @@ impl<'a> DatadirModification<'a> { pub async fn put_twophase_file( &mut self, - xid: TransactionId, + xid: u64, img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; - if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid = xid as u32; + let mut dir = TwoPhaseDirectory::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) 
+ }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); self.put(twophase_file_key(xid), Value::Image(img)); Ok(()) @@ -1639,22 +1666,32 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub async fn drop_twophase_file( &mut self, - xid: TransactionId, + xid: u64, ctx: &RequestContext, ) -> anyhow::Result<()> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&buf)?; - if !dir.xids.remove(&xid) { - warn!("twophase file for xid {} does not exist", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid: u32 = u32::try_from(xid)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); // Delete it self.delete(twophase_key_range(xid)); @@ -2124,11 +2161,21 @@ struct DbDirectory { dbdirs: HashMap<(Oid, Oid), bool>, } +// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of +// pg_twophase files was expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files +// were named like "pg_twophase/000002E5", now they're like +// "pg_twophsae/0000000A000002E4". 
+ #[derive(Debug, Serialize, Deserialize)] struct TwoPhaseDirectory { xids: HashSet, } +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectoryV17 { + xids: HashSet, +} + #[derive(Debug, Serialize, Deserialize, Default)] struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 6e15ad81c3..229c01a681 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -237,6 +237,26 @@ impl WalIngest { .await?; } } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } pg_constants::RM_TBLSPC_ID => { @@ -246,7 +266,11 @@ impl WalIngest { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -260,7 +284,7 @@ impl WalIngest { .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf); + let xlrec = 
XlClogTruncate::decode(&mut buf, pg_version); self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } @@ -299,12 +323,21 @@ impl WalIngest { parsed_xact.xid, lsn, ); - modification - .drop_twophase_file(parsed_xact.xid, ctx) - .await?; + + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(parsed_xact.xid)? + } else { + parsed_xact.xid as u64 + }; + modification.drop_twophase_file(xid, ctx).await?; } else if info == pg_constants::XLOG_XACT_PREPARE { + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(decoded.xl_xid)? + } else { + decoded.xl_xid as u64 + }; modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } @@ -312,7 +345,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -325,7 +362,11 @@ impl WalIngest { ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -354,6 +395,20 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_PARAMETER_CHANGE { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlParameterChange::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } else if info == 
pg_constants::XLOG_END_OF_RECOVERY { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlEndOfRecovery::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { if info == pg_constants::XLOG_NEXTOID { let next_oid = buf.get_u32_le(); @@ -397,12 +452,24 @@ impl WalIngest { if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let mut oldest_active_xid = cp.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; + let oldest_active_xid = if pg_version >= 17 { + let mut oldest_active_full_xid = cp.nextXid.value; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if xid < oldest_active_full_xid { + oldest_active_full_xid = xid; + } } - } + oldest_active_full_xid as u32 + } else { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + let narrow_xid = xid as u32; + if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = narrow_xid; + } + } + oldest_active_xid + }; cp.oldestActiveXid = oldest_active_xid; } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; @@ -515,6 +582,25 @@ impl WalIngest { Ok(modification.len() > prev_len) } + /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + let next_full_xid = + enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); + + let next_xid = (next_full_xid) as u32; + let mut epoch = (next_full_xid >> 32) as u32; + + if xid > next_xid { + // Wraparound occurred, must be from a prev epoch. 
+ if epoch == 0 { + bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + } + epoch -= 1; + } + + Ok((epoch as u64) << 32 | xid as u64) + } + /// Do not store this block, but observe it for the purposes of updating our relation size state. async fn observe_decoded_block( &mut self, @@ -815,6 +901,73 @@ impl WalIngest { bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } _ => {} } @@ -923,26 +1076,26 @@ impl WalIngest { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 => { + 16 | 17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + let xlrec = 
v17::rm_neon::XlNeonHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_UPDATE | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. @@ -958,7 +1111,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { @@ -974,7 +1127,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 0c4d575de8..dd199e2c55 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -174,6 +174,7 @@ impl DecodedWALRecord { } 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -341,16 +342,47 @@ pub mod v14 { } } } + + #[repr(C)] + #[derive(Debug)] + pub struct XlParameterChange { + pub max_connections: i32, + pub max_worker_processes: i32, + pub max_wal_senders: i32, + pub max_prepared_xacts: i32, + pub max_locks_per_xact: i32, + pub wal_level: i32, + pub wal_log_hints: bool, + pub 
track_commit_timestamp: bool, + pub _padding: [u8; 2], + } + + impl XlParameterChange { + pub fn decode(buf: &mut Bytes) -> XlParameterChange { + XlParameterChange { + max_connections: buf.get_i32_le(), + max_worker_processes: buf.get_i32_le(), + max_wal_senders: buf.get_i32_le(), + max_prepared_xacts: buf.get_i32_le(), + max_locks_per_xact: buf.get_i32_le(), + wal_level: buf.get_i32_le(), + wal_log_hints: buf.get_u8() != 0, + track_commit_timestamp: buf.get_u8() != 0, + _padding: [buf.get_u8(), buf.get_u8()], + } + } + } } pub mod v15 { pub use super::v14::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate, + XlParameterChange, }; } pub mod v16 { - pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use bytes::{Buf, Bytes}; use postgres_ffi::{OffsetNumber, TransactionId}; @@ -529,6 +561,37 @@ pub mod v16 { } } +pub mod v17 { + pub use super::v14::XlHeapLockUpdated; + use bytes::{Buf, Bytes}; + pub use postgres_ffi::{TimeLineID, TimestampTz}; + + pub use super::v16::rm_neon; + pub use super::v16::{ + XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + }; + + #[repr(C)] + #[derive(Debug)] + pub struct XlEndOfRecovery { + pub end_time: TimestampTz, + pub this_time_line_id: TimeLineID, + pub prev_time_line_id: TimeLineID, + pub wal_level: i32, + } + + impl XlEndOfRecovery { + pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery { + XlEndOfRecovery { + end_time: buf.get_i64_le(), + this_time_line_id: buf.get_u32_le(), + prev_time_line_id: buf.get_u32_le(), + wal_level: buf.get_i32_le(), + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -746,9 +809,13 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { XlClogTruncate { - pageno: 
buf.get_u32_le(), + pageno: if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }, oldest_xid: buf.get_u32_le(), oldest_xid_db: buf.get_u32_le(), } diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h new file mode 100644 index 0000000000..0a131816ef --- /dev/null +++ b/pgxn/neon/bitmap.h @@ -0,0 +1,12 @@ +#ifndef NEON_BITMAP_H +#define NEON_BITMAP_H + +/* + * Utilities for manipulating bits8* as bitmaps. + */ + +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + +#endif //NEON_BITMAP_H diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 479209a537..ab6739465b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -27,6 +27,7 @@ #include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" +#include "port/pg_iovec.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" @@ -40,6 +41,7 @@ #include "utils/guc.h" #include "hll.h" +#include "bitmap.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. 
+ */ +int +lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + int nblocks, bits8 *bitmap) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 chunk_offs; + int found = 0; + uint32 hash; + int i = 0; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return 0; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + + LWLockAcquire(lfc_lock, LW_SHARED); + + while (true) + { + int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + if (LFC_ENABLED()) + { + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + { + if ((entry->bitmap[chunk_offs >> 5] & + (1 << (chunk_offs & 31))) != 0) + { + BITMAP_SET(bitmap, i); + found++; + } + } + } + else + { + i += this_chunk; + } + } + else + { + return found; + } + + /* + * Break out of the iteration before doing expensive stuff for + * a next iteration + */ + if (i + 1 >= nblocks) + break; + + /* + * Prepare for the next iteration. We don't unlock here, as that'd + * probably be more expensive than the gains it'd get us. + */ + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + } + + LWLockRelease(lfc_lock); + +#if USE_ASSERT_CHECKING + do { + int count = 0; + + for (int j = 0; j < nblocks; j++) + { + if (BITMAP_ISSET(bitmap, j)) + count++; + } + + Assert(count == found); + } while (false); +#endif + + return found; +} + /* * Evict a page (if present) from the local file cache */ @@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* - * Try to read page from local cache. 
- * Returns true if page is found in local cache. - * In case of error local file cache is disabled (lfc->limit is set to zero). + * Try to read pages from local cache. + * Returns the number of pages read from the local cache, and sets bits in + * 'read' for the pages which were read. This may scribble over buffers not + * marked in 'read', so be careful with operation ordering. + * + * In case of error local file cache is disabled (lfc->limit is set to zero), + * and -1 is returned. Note that 'read' and the buffers may be touched and in + * an otherwise invalid state. + * + * If the mask argument is supplied, bits will be set at the offsets of pages + * that were present and read from the LFC. */ -bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +int +lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void **buffers, BlockNumber nblocks, bits8 *mask) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); bool result = true; uint32 hash; uint64 generation; uint32 entry_offset; + int blocks_read = 0; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return false; + return 0; if (!lfc_ensure_opened()) - return false; + return 0; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. 
+ */ + while (nblocks > 0) { + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + int iteration_hits = 0; + int iteration_misses = 0; + Assert(blocks_in_chunk > 0); + + for (int i = 0; i < blocks_in_chunk; i++) + { + iov[i].iov_base = buffers[buf_offset + i]; + iov[i].iov_len = BLCKSZ; + } + + tag.blockNum = blkno - chunk_offs; + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* We can return the blocks we've read before LFC got disabled; + * assuming we read any. */ + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return blocks_read; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set for the blocks assumed in this entry */ + for (int i = 0; i < blocks_in_chunk; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + + if (entry == NULL) + { + /* Pages are not cached */ + lfc_ctl->misses += blocks_in_chunk; + pgBufferUsage.file_cache.misses += blocks_in_chunk; + LWLockRelease(lfc_lock); + + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + + continue; + } + + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + LWLockRelease(lfc_lock); - return false; - } - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + for (int i = 0; i < blocks_in_chunk; i++) + { + /* + * If the page is valid, we consider it "read". 
+ * All other pages will be fetched separately by the next cache + */ + if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + { + BITMAP_SET(mask, buf_offset + i); + iteration_hits++; + } + else + iteration_misses++; + } - /* Approximate working set */ - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + Assert(iteration_hits + iteration_misses > 0); + + if (iteration_hits != 0) + { + rc = preadv(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + + if (rc != (BLCKSZ * blocks_in_chunk)) + { + lfc_disable("read"); + return -1; + } + } + + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + lfc_ctl->hits += iteration_hits; + lfc_ctl->misses += iteration_misses; + pgBufferUsage.file_cache.hits += iteration_hits; + pgBufferUsage.file_cache.misses += iteration_misses; + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } + else + { + /* generation mismatch, assume error condition */ + LWLockRelease(lfc_lock); + return -1; + } - if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) - { - /* Page is not cached */ - lfc_ctl->misses += 1; - pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); - return false; - } - /* Unlink entry from LRU list to pin it for the duration of IO operation */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - generation = lfc_ctl->generation; - entry_offset = entry->offset; - LWLockRelease(lfc_lock); - - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("read"); - return false; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + 
blocks_read += iteration_hits; } - /* Place entry to the head of LRU list */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - lfc_ctl->hits += 1; - pgBufferUsage.file_cache.hits += 1; - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - } - else - result = false; - - LWLockRelease(lfc_lock); - - return result; + return blocks_read; } /* @@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. */ void -#if PG_MAJORVERSION_NUM < 16 -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer) -#endif +lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *const *buffers, BlockNumber nblocks) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); uint32 hash; uint64 generation; uint32 entry_offset; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; @@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (!lfc_ensure_opened()) return; - tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. 
+ * + * If there is an error, we do an early return. + */ + while (nblocks > 0) { - LWLockRelease(lfc_lock); - return; - } + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + Assert(blocks_in_chunk > 0); - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - if (found) - { - /* - * Unlink entry from LRU list to pin it for the duration of IO - * operation - */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - } - else - { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. 
- */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + for (int i = 0; i < blocks_in_chunk; i++) { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); + iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]); + iov[i].iov_len = BLCKSZ; } - else if (!dlist_is_empty(&lfc_ctl->holes)) + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED()) { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool found; + LWLockRelease(lfc_lock); + return; + } - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); - CriticalAssert(found); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + if (found) + { + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); } else { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ - } - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; - entry_offset = entry->offset; - lfc_ctl->writes += 1; - LWLockRelease(lfc_lock); - - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != 
BLCKSZ) - { - lfc_disable("write"); - } - else - { - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - /* Place entry to the head of LRU list */ - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - - entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + /* + * We have two choices if all cache pages are pinned (i.e. used in IO + * operations): + * + * 1) Wait until some of this operation is completed and pages is + * unpinned. + * + * 2) Allocate one more chunk, so that specified cache size is more + * recommendation than hard limit. + * + * As far as probability of such event (that all pages are pinned) is + * considered to be very very small: there are should be very large + * number of concurrent IO operations and them are limited by + * max_connections, we prefer not to complicate code and use second + * approach. + */ + if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = 
lfc_ctl->size++; /* allocate new chunk at end + * of file */ + } + entry->access_count = 1; + entry->hash = hash; + memset(entry->bitmap, 0, sizeof entry->bitmap); } + generation = lfc_ctl->generation; + entry_offset = entry->offset; + lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); + + rc = pwritev(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + if (rc != BLCKSZ * blocks_in_chunk) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + for (int i = 0; i < blocks_in_chunk; i++) + { + entry->bitmap[(chunk_offs + i) >> 5] |= + (1 << ((chunk_offs + i) & 31)); + } + } + + LWLockRelease(lfc_lock); + } + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5126c26c5d..df7000acc0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel) /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); +#if PG_MAJORVERSION_NUM >= 17 + shard->wes_read = CreateWaitEventSet(NULL, 3); +#else shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); +#endif AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index addb6ccce6..59b97d64fe 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -6,7 +6,11 @@ #ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H +#if PG_MAJORVERSION_NUM < 
17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) +#else +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER) +#endif #define RelFileInfoEquals(a, b) ( \ NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ @@ -50,7 +54,7 @@ #define CopyNRelFileInfoToBufTag(tag, rinfo) \ do { \ (tag).rnode = (rinfo); \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) tag.rnode @@ -98,7 +102,7 @@ (tag).spcOid = (rinfo).spcOid; \ (tag).dbOid = (rinfo).dbOid; \ (tag).relNumber = (rinfo).relNumber; \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) \ ((RelFileLocator) { \ @@ -113,4 +117,10 @@ #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#if PG_MAJORVERSION_NUM < 17 +#define ProcNumber BackendId +#define INVALID_PROC_NUMBER InvalidBackendId +#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) +#endif + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 1f196d016c..4c9e40a063 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -6,8 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/neon/pagestore_client.h - * *------------------------------------------------------------------------- */ #ifndef pageserver_h @@ -187,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg); * API */ -typedef unsigned shardno_t; +typedef uint16 shardno_t; typedef struct { @@ -211,7 +209,7 @@ extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); -extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); @@ -233,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation 
reln, ForkNumber forknum, BlockNumber blocknum, int nbuffers, bool skipFsync); #endif +#if PG_MAJORVERSION_NUM >=17 +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif /* * LSN values associated with each request to the pageserver @@ -269,19 +272,11 @@ typedef struct } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, char *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); #else -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); #endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); @@ -299,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -#if PG_MAJORVERSION_NUM < 16 -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer); -#else -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer); -#endif -extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void 
lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const *buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *read for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7f39c7d026..36538ea5e2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,6 +58,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" +#include "port/pg_iovec.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" @@ -66,6 +67,7 @@ #include "storage/smgr.h" #include "pagestore_client.h" +#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -170,16 +172,28 @@ typedef enum PrefetchStatus * valid */ } PrefetchStatus; +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_SEQ = 0x1, +} PrefetchRequestFlags; + typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 
flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ - PrefetchStatus status; - shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; +StaticAssertDecl(sizeof(PrefetchRequest) == 64, + "We prefer to have a power-of-2 size for this struct. Please" + " try to find an alternative solution before reaching to" + " increase the expected size here"); + /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -251,17 +265,17 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) -#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) -#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) - static PrefetchState *MyPState; +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + #define GetPrfSlot(ring_index) ( \ ( \ AssertMacro((ring_index) < MyPState->ring_unused && \ (ring_index) >= MyPState->ring_last), \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + GetPrfSlotNoCheck(ring_index) \ ) \ ) @@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns); +#endif -static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); -static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks, const bits8 *mask); +static bool 
neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); static bool @@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns if (force_request_lsns) slot->request_lsns = *force_request_lsns; else - slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum); + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1, NULL); request.req.lsn = slot->request_lsns.request_lsn; request.req.not_modified_since = slot->request_lsns.not_modified_since; @@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns */ static uint64 -prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) { - uint64 ring_index; + uint64 min_ring_index; PrefetchRequest req; - PrefetchRequest *slot; - PrfHashEntry *entry; +#if USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. 
*/ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; + Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); - - if (entry != NULL) + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; - /* - * If the caller specified a request LSN to use, only accept prefetch - * responses that satisfy that request. - */ - if (force_request_lsns) - { - if (!neon_prefetch_response_usable(*force_request_lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - } +#if USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + req.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); if (entry != NULL) { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag)); + /* - * We received a prefetch for a page that was recently read and - * removed from the buffers. Remove that request from the buffers. + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. 
*/ - if (slot->status == PRFS_TAG_REMAINS) + if (lsns) { - prefetch_set_unused(ring_index); - entry = NULL; + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. 
(We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. + */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); } else { - /* The buffered request is good enough, return that index */ - pgBufferUsage.prefetch.duplicates++; - return ring_index; + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } - } - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page unnecessarily - * in that case. If the oldest slot holds a request that we haven't - * received a response for yet, we have to wait for the response to that - * before we can continue. We might not have even flushed the request to - * the pageserver yet, it might be just sitting in the output buffer. In - * that case, we flush it and wait for the response. 
(We could decide not - * to send it, but it's hard to abort when the request is already in the - * output buffer, and 'not sending' a prefetch request kind of goes - * against the principles of prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); - } - } + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. + */ + slot->buftag = req.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + + min_ring_index = Min(min_ring_index, ring_index); + + prefetch_do_request(slot, lsns); } - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. 
- */ - ring_index = MyPState->ring_unused; - slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; + Assert(any_hits); - Assert(MyPState->ring_last <= ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. - */ - slot->buftag = tag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - - prefetch_do_request(slot, force_request_lsns); - Assert(slot->status == PRFS_REQUESTED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) @@ -921,9 +996,17 @@ Retry: MyPState->ring_flush = MyPState->ring_unused; } - return ring_index; + return min_ring_index; } + +static uint64 +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +{ + return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL); +} + + /* * Note: this function can get canceled and use a long jump to the next catch * context. Take care. @@ -1348,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * Wrapper around log_newpages() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. 
+ */ +static XLogRecPtr +log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, + BlockNumber nblocks, Page *pages, bool page_std) +{ + PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pageptrs[XLR_MAX_BLOCK_ID]; + int nregistered = 0; + XLogRecPtr result = 0; + + for (int i = 0; i < nblocks; i++) + { + Page page = copied_buffer[nregistered].data; + memcpy(page, pages[i], BLCKSZ); + pageptrs[nregistered] = page; + blknos[nregistered] = blkno + i; + + ++nregistered; + + if (nregistered >= XLR_MAX_BLOCK_ID) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + nregistered = 0; + } + } + + if (nregistered != 0) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + } + + return ProcLastRecPtr; +} +#endif /* PG_MAJORVERSION_NUM >= 17 */ + /* * Is 'buffer' identical to a freshly initialized empty heap page? */ @@ -1361,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +#if PG_MAJORVERSION_NUM >= 17 +static void +neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, const char **buffers, bool force) +{ +#define BLOCK_BATCH_SIZE 16 + bool log_pages; + BlockNumber batch_blockno = blocknum; + XLogRecPtr lsns[BLOCK_BATCH_SIZE]; + int batch_size = 0; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. 
+ */ + log_pages = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_pages = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_pages = true; + } + + if (log_pages) + { + XLogRecPtr recptr; + recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + nblocks, (Page *) buffers, false); + + for (int i = 0; i < nblocks; i++) + PageSetLSN(unconstify(char *, buffers[i]), recptr); + + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u " + "were force logged, lsn=%X/%X", + blocknum, blocknum + nblocks, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(recptr)))); + } + + for (int i = 0; i < nblocks; i++) + { + Page page = (Page) buffers[i]; + BlockNumber blkno = blocknum + i; + XLogRecPtr lsn = PageGetLSN(page); + + if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. 
+ */ + if (PageIsNew(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (PageIsEmptyHeapPage(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) + { + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? LOG : PANIC, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ + } + } + else + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + lsns[batch_size++] = lsn; + + if (batch_size >= BLOCK_BATCH_SIZE) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + batch_blockno += batch_size; + batch_size = 0; + } + } + + if (batch_size != 0) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + } +} +#endif + /* * A page is being evicted from the shared buffer cache. 
 Update the
 * last-written LSN of the page, and WAL-log it if needed.
 */
-static void
 #if PG_MAJORVERSION_NUM < 16
+static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 char *buffer, bool force)
 #else
+static void
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 const char *buffer, bool force)
 #endif
 {
@@ -1548,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn)
 	return lsn;
 }
 
+
+/*
+ * Since PG17 we use the vectorized version,
+ * so add a compatibility function for older versions
+ */
+#if PG_MAJORVERSION_NUM < 17
+static void
+GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum,
+				   BlockNumber blkno, int nblocks, XLogRecPtr *lsns)
+{
+	lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno);
+}
+#endif
+
 /*
  * Return LSN for requesting pages and number of blocks from page server
  */
-static neon_request_lsns
-neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
+static void
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+					  neon_request_lsns *output, BlockNumber nblocks,
+					  const bits8 *mask)
 {
-	XLogRecPtr	last_written_lsn;
-	neon_request_lsns result;
+	XLogRecPtr	last_written_lsns[PG_IOV_MAX];
 
-	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
-	last_written_lsn = nm_adjust_lsn(last_written_lsn);
-	Assert(last_written_lsn != InvalidXLogRecPtr);
+	Assert(nblocks <= PG_IOV_MAX);
+
+	GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns);
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]);
+		Assert(last_written_lsns[i] != InvalidXLogRecPtr);
+	}
 
 	if (RecoveryInProgress())
 	{
@@ -1630,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		/* Request the page at the end of the last fully replayed LSN. 
*/ XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - if (last_written_lsn > replay_lsn) + for (int i = 0; i < nblocks; i++) { - /* GetCurrentReplayRecPtr was introduced in v15 */ + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ #if PG_VERSION_NUM >= 150000 - Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); #endif - /* - * Cases 2 and 4. If this is a backend (case 4), the - * neon_read_at_lsn() call later will wait for the WAL record to be - * fully replayed. - */ - result.request_lsn = last_written_lsn; - } - else - { - /* cases 1 and 3 */ - result.request_lsn = replay_lsn; - } - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = result.request_lsn; - Assert(last_written_lsn <= result.request_lsn); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result->request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result->request_lsn = replay_lsn; + } - neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = result->request_lsn; + Assert(last_written_lsn <= result->request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since)); + } } else { XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache as the - * 'not_modified_since' hint. 
Any pages modified by later WAL records - * must still in the buffer cache, so our request cannot concern - * those. - */ - neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", - LSN_FORMAT_ARGS(last_written_lsn)); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index - * building, _bt_blwritepage logs the full page without flushing WAL - * before smgrextend (files are fsynced before build ends). - */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); #else flushlsn = GetFlushRecPtr(); #endif - if (last_written_lsn > flushlsn) + + for (int i = 0; i < nblocks; i++) { - neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - LSN_FORMAT_ARGS(last_written_lsn), - LSN_FORMAT_ARGS(flushlsn)); - XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + /* + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. + */ + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). 
+ */ + if (last_written_lsn > flushlsn) + { + neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; + } + + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result->request_lsn = UINT64_MAX; + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = flushlsn; } - - /* - * Request the very latest version of the page. In principle we - * want to read the page at the current insert LSN, and we could - * use that value in the request. However, there's a corner case - * with pageserver's garbage collection. If the GC horizon is - * set to a very small value, it's possible that by the time - * that the pageserver processes our request, the GC horizon has - * already moved past the LSN we calculate here. 
Standby servers - * always have that problem as the can always lag behind the - * primary, but for the primary we can avoid it by always - * requesting the latest page, by setting request LSN to - * UINT64_MAX. - * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. - */ - result.request_lsn = UINT64_MAX; - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = flushlsn; } - - return result; } /* @@ -1728,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) * satisfy a page read now. */ static bool -neon_prefetch_response_usable(neon_request_lsns request_lsns, +neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); @@ -1755,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns 
request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; @@ -1817,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1886,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, @@ -2068,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, */ if (max_cluster_size > 0 && reln->smgr_relpersistence == 
RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2149,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2247,6 +2558,73 @@ neon_close(SMgrRelation reln, ForkNumber forknum) } +#if PG_MAJORVERSION_NUM >= 17 +/* + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + BufferTag tag; + bool io_initiated = false; + + switch (reln->smgr_relpersistence) + { + case 0: /* probably shouldn't happen, but ignore it */ + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum, nblocks); + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + tag.spcOid = reln->smgr_rlocator.locator.spcOid; + tag.dbOid = reln->smgr_rlocator.locator.dbOid; + tag.relNumber = reln->smgr_rlocator.locator.relNumber; + tag.forkNum = forknum; + + while (nblocks > 0) + { + int iterblocks = Min(nblocks, PG_IOV_MAX); + int seqlen = 0; + bits8 lfc_present[PG_IOV_MAX / 8]; + memset(lfc_present, 0, sizeof(lfc_present)); + + if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, + iterblocks, lfc_present) == iterblocks) + { + nblocks -= iterblocks; + blocknum += iterblocks; + continue; + } + + io_initiated = true; + + tag.blockNum = blocknum; + + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_present[i] = ~(lfc_present[i]); + + ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, + lfc_present); + nblocks -= iterblocks; + blocknum += 
iterblocks; + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + } + + return false; +} + + +#else /* PG_MAJORVERSION_NUM >= 17 */ /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -2285,6 +2663,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return false; } +#endif /* PG_MAJORVERSION_NUM < 17 */ + /* * neon_writeback() -- Tell the kernel to write pages back to storage. @@ -2315,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ + /* + * TODO: WAL sync up to lwLsn for the indicated blocks + * Without that sync, writeback doesn't actually guarantee the data is + * persistently written, which does seem to be one of the assumed + * properties of this smgr API call. + */ neon_log(SmgrTrace, "writeback noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -2324,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * While function is defined in the neon extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. 
- */ -void +static void #if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + char **buffers, BlockNumber nblocks, const bits8 *mask) #else -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, void *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) #endif { NeonResponse *resp; uint64 ring_index; PrfHashEntry *entry; PrefetchRequest *slot; - BufferTag buftag = - { - .forkNum = forkNum, - .blockNum = blkno, - }; + BufferTag buftag = {0}; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); CopyNRelFileInfoToBufTag(buftag, rinfo); + buftag.forkNum = forkNum; + buftag.blockNum = base_blockno; /* * The redo process does not lock pages that it needs to replay but are @@ -2365,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * weren't for the behaviour of the LwLsn cache that uses the highest * value of the LwLsn cache when the entry is not found. */ - if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsns.request_lsn); + prefetch_register_bufferv(buftag, request_lsns, nblocks, mask); - /* - * Try to find prefetched page in the list of received pages. - */ + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns[0].request_lsn); + + /* + * Try to find prefetched page in the list of received pages. 
+ */ Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(request_lsns, slot)) + if (entry != NULL) { - ring_index = slot->my_ring_index; - pgBufferUsage.prefetch.hits += 1; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; + ring_index = slot->my_ring_index; + pgBufferUsage.prefetch.hits += 1; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. 
+ */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - /* make it look like a prefetch cache miss */ - entry = NULL; } - } - do - { - if (entry == NULL) + do { - pgBufferUsage.prefetch.misses += 1; + if (entry == NULL) + { + pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsns); - slot = GetPrfSlot(ring_index); - } - else + ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. 
- */ - entry = NULL; + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rinfo, forkNum, blockno, buffer); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rinfo, forkNum, blkno, buffer); - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blkno, - RelFileInfoFmt(rinfo), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + 
prefetch_cleanup_trailing_unused(); } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); } /* - * neon_read() -- Read the specified block from a relation. + * While function is defined in the neon extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. */ void #if PG_MAJORVERSION_NUM < 16 +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, char *buffer) +#else +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, void *buffer) +#endif +{ + neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); +} + +#if PG_MAJORVERSION_NUM < 17 +/* + * neon_read() -- Read the specified block from a relation. + */ +#if PG_MAJORVERSION_NUM < 16 +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { @@ -2502,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -2578,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } #endif } +#endif /* PG_MAJORVERSION_NUM <= 16 */ + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) +{ + bits8 read[PG_IOV_MAX / 8]; + neon_request_lsns request_lsns[PG_IOV_MAX]; + int lfc_result; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() 
on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (nblocks > PG_IOV_MAX) + neon_log(ERROR, "Read request too large: %d is larger than max %d", + nblocks, PG_IOV_MAX); + + memset(read, 0, sizeof(read)); + + /* Try to read from local file cache */ + lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, + nblocks, read); + + /* Read all blocks from LFC, so we're done */ + if (lfc_result == nblocks) + return; + + if (lfc_result == -1) + { + /* can't use the LFC result, so read all blocks from PS */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = 0xFF; + } + else + { + /* invert the result: exclude blocks read from lfc */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = ~(read[i]); + } + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks, read); + + neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + for (int i = 0; i < nblocks; i++) + { +#if PG_MAJORVERSION_NUM >= 17 + mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); +#else + mdread(reln, forkNum, blkno + i, mdbuf); +#endif + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew((Page) mdbuf)) + { + if (!PageIsNew((Page) pageserver_masked)) + { + neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if 
(PageIsNew((Page) buffer)) + { + neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } + } +#endif +} +#endif #ifdef DEBUG_COMPARE_LOCAL static char * @@ -2623,7 +3179,72 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo if (mdexists(reln, forknum)) { /* It exists locally. Guess it's unlogged then. 
*/ +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif + return; + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_wallog_page(reln, forknum, blocknum, buffer, false); + + lsn = PageGetLSN((Page) buffer); + neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif +#endif +} + + + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. 
*/ + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); /* * We could set relpersistence now that we have determined @@ -2641,29 +3262,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); return; - default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - neon_wallog_page(reln, forknum, blocknum, buffer, false); + neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); - lsn = PageGetLSN((Page) buffer); - neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, blocknum, - (uint32) (lsn >> 32), (uint32) lsn); - - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); #endif } +#endif + /* * neon_nblocks() -- Get the number of blocks stored in a relation. 
*/ @@ -2699,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, @@ -2757,7 +3375,9 @@ neon_dbsize(Oid dbNode) neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(dummy_node, MAIN_FORKNUM, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, @@ -2898,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +#if PG_MAJORVERSION_NUM >= 17 +void +neon_regisersync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdregistersync(reln, forknum); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} +#endif + + /* * neon_start_unlogged_build() -- Starting build operation on a rel. 
* @@ -3047,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn, - not_modified_since; + XLogRecPtr request_lsn, + not_modified_since; + SlruKind kind; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -3078,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); - SlruKind kind; - - if (STRPREFIX(path, "pg_xact")) - kind = SLRU_CLOG; - else if (STRPREFIX(path, "pg_multixact/members")) - kind = SLRU_MULTIXACT_MEMBERS; - else if (STRPREFIX(path, "pg_multixact/offsets")) - kind = SLRU_MULTIXACT_OFFSETS; - else - return -1; + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, .req.lsn = request_lsn, .req.not_modified_since = not_modified_since, - .kind = kind, .segno = segno }; - int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do { while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); } while (resp == NULL); @@ -3182,14 +3835,23 @@ static const struct f_smgr neon_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = neon_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = neon_prefetch, + .smgr_readv = neon_readv, + .smgr_writev = neon_writev, +#else .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, +#endif + .smgr_writeback = neon_writeback, .smgr_nblocks = neon_nblocks, 
.smgr_truncate = neon_truncate, .smgr_immedsync = neon_immedsync, - +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = neon_regisersync, +#endif .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, @@ -3198,11 +3860,11 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, NRelFileInfo rinfo) +smgr_neon(ProcNumber backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) + if (backend != INVALID_PROC_NUMBER) return smgr_standard(backend, rinfo); else return &neon_smgr; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 65ef588ba5..4d0d06e6de 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -81,6 +81,7 @@ static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); static void walprop_register_bgworker(void); @@ -90,7 +91,7 @@ static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); -static process_interrupts_callback_t PrevProcessInterruptsCallback; +static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; static shmem_startup_hook_type prev_shmem_startup_hook_type; #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; @@ -178,7 +179,7 @@ pg_init_walproposer(void) nwp_prepare_shmem(); - delay_backend_us = &backpressure_lag_impl; + delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -352,6 +353,22 @@ 
backpressure_lag_impl(void) return 0; } +/* + * We don't apply backpressure when we're the postmaster, or the startup + * process, because in postmaster we can't apply backpressure, and in + * the startup process we can't afford to slow down. + */ +static uint64 +startup_backpressure_wrap(void) +{ + if (AmStartupProcess() || !IsUnderPostmaster) + return 0; + + delay_backend_us = &backpressure_lag_impl; + + return backpressure_lag_impl(); +} + /* * WalproposerShmemSize --- report amount of shared memory space needed */ @@ -401,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void) static bool backpressure_throttling_impl(void) { - int64 lag; + uint64 lag; TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + bool retry = false; + + if (PointerIsValid(PrevProcessInterruptsCallback)) + retry = PrevProcessInterruptsCallback(); /* * Don't throttle read only transactions or wal sender. Do throttle CREATE @@ -602,7 +620,12 @@ walprop_pg_init_walsender(void) /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { +#if PG_MAJORVERSION_NUM >= 17 + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, + false, false, false); +#else ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); +#endif ReplicationSlotReserveWal(); /* Write this slot to disk */ ReplicationSlotMarkDirty(); @@ -1509,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp) wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ +#if PG_MAJORVERSION_NUM >= 17 + waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers); +#else waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); +#endif AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, 
PGINVALID_SOCKET, diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c index f327e132e9..66032c88f6 100644 --- a/pgxn/neon_rmgr/neon_rmgr_decode.c +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -1,6 +1,7 @@ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 + #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "replication/decode.h" @@ -9,6 +10,10 @@ #include "neon_rmgr.h" +#endif /* PG >= 16 */ + +#if PG_MAJORVERSION_NUM == 16 + /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); @@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } +#endif + +#if PG_MAJORVERSION_NUM == 17 + +/* individual record(group)'s handlers */ +static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple); -#endif \ No newline at end of file +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. 
+ */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. + */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). 
+ */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. 
+ */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapDelete; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. 
+ */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. 
+ */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + HeapTuple tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* + * We can only figure this out after reassembling the transactions. 
+ */ + tuple->t_tableOid = InvalidOid; + + tuple->t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
+ */ +static void +DecodeXLogTuple(char *data, Size len, HeapTuple tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->t_len = datalen + SizeofHeapTupleHeader; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} +#endif diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 4e604a710c..a45e8f5c4a 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum); static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM >= 17 +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif #if PG_MAJORVERSION_NUM < 16 static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - 
+#if PG_MAJORVERSION_NUM >= 17 +static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); +#endif /* * inmem_init() -- Initialize private state @@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + return true; +} +#else /* * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { return true; } +#endif /* * inmem_writeback() -- Tell the kernel to write pages back to storage. @@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, /* * inmem_read() -- Read the specified block from a relation. */ +#if PG_MAJORVERSION_NUM < 16 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, -#if PG_MAJORVERSION_NUM < 16 char *buffer) #else +static void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void *buffer) #endif { @@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, memcpy(buffer, page_body[pg], BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + void **buffers, BlockNumber nblocks) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_read(reln, forknum, blkno + i, buffers[i]); + } +} +#endif + /* * inmem_write() -- Write the supplied block at the appropriate location. 
* @@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, memcpy(page_body[pg], buffer, BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_write(reln, forknum, blkno + i, buffers[i], skipFsync); + } +} +#endif + /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ @@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_registersync(SMgrRelation reln, ForkNumber forknum) +{ +} +#endif + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = inmem_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = inmem_prefetch, + .smgr_readv = inmem_readv, + .smgr_writev = inmem_writev, +#else .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, +#endif .smgr_writeback = inmem_writeback, .smgr_nblocks = inmem_nblocks, .smgr_truncate = inmem_truncate, .smgr_immedsync = inmem_immedsync, + +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = inmem_registersync, +#endif + + .smgr_start_unlogged_build = NULL, + .smgr_finish_unlogged_build_phase_1 = NULL, + .smgr_end_unlogged_build = NULL, + .smgr_read_slru_segment = NULL, }; const f_smgr * -smgr_inmem(BackendId backend, NRelFileInfo rinfo) +smgr_inmem(ProcNumber backend, NRelFileInfo rinfo) { Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rinfo); - else - return &inmem_smgr; + // NOTE(review): smgr_standard fallback for local backends is disabled here; walredo runs only in recovery (Assert above) -- confirm before deleting the commented-out code below. 
+ // if (backend != INVALID_PROC_NUMBER) + // return smgr_standard(backend, rinfo); + // else + return &inmem_smgr; } void diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index 58b98b8e6a..91f1c80965 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index cc545393f5..219ca85207 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -100,6 +100,9 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/dsm.h" +#if PG_MAJORVERSION_NUM >= 17 +#include "storage/dsm_registry.h" +#endif #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -137,7 +140,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE DEBUG5 +#define TRACE LOG #ifdef HAVE_LIBSECCOMP @@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up xlog, clog, and buffers */ +#if PG_MAJORVERSION_NUM >= 17 + DSMRegistryShmemInit(); + VarsupShmemInit(); +#endif XLOGShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); @@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up other modules that need some shared memory space */ +#if PG_MAJORVERSION_NUM < 17 + /* "snapshot too old" was removed in PG17, and with it the SnapMgr */ SnapMgrInit(); +#endif BTreeShmemInit(); SyncScanShmemInit(); /* Skip due to the 'pg_notify' directory check */ @@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message) target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, INVALID_PROC_NUMBER, 
RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 064a678c96..d8390138c9 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -13,7 +13,7 @@ DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 class Lsn: """ Datatype for an LSN. Internally it is a 64-bit integer, but the string - representation is like "1/123abcd". See also pg_lsn datatype in Postgres + representation is like "1/0123abcd". See also pg_lsn datatype in Postgres """ def __init__(self, x: Union[int, str]): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ee62372871..50284a3f5a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -933,8 +933,11 @@ class NeonEnvBuilder: for directory_to_clean in reversed(directories_to_clean): if not os.listdir(directory_to_clean): - log.debug(f"Removing empty directory {directory_to_clean}") - directory_to_clean.rmdir() + log.info(f"Removing empty directory {directory_to_clean}") + try: + directory_to_clean.rmdir() + except Exception as e: + log.error(f"Error removing empty directory {directory_to_clean}: {e}") def cleanup_remote_storage(self): for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: @@ -3423,6 +3426,7 @@ class VanillaPostgres(PgProtocol): assert not self.running with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) + conf_file.write("\n") def edit_hba(self, hba: List[str]): """Prepend hba lines into pg_hba.conf file.""" @@ -3476,6 +3480,7 @@ def vanilla_pg( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + 
vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) yield vanilla_pg diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index e12c8e5f4a..258935959b 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" + V17 = "17" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json new file mode 100644 index 0000000000..7990b2c3a2 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG17 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index b559be5f18..fb5c1d3115 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -156,6 +156,9 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_ @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: 
revert this once we have snapshots def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -203,6 +206,9 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_forward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 27eb05ac09..7370eb1456 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -44,6 +44,8 @@ def test_remote_extensions( ): if pg_version == PgVersion.V16: pytest.skip("TODO: PG16 extension building") + if pg_version == PgVersion.V17: + pytest.skip("TODO: PG17 extension building") # setup mock http server # that expects request for anon.tar.zst diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py index 03e8c7c0df..4145a303c6 100644 --- a/test_runner/regress/test_postgres_version.py +++ b/test_runner/regress/test_postgres_version.py @@ -20,16 +20,19 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): output = f.read().strip() # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". - pattern = r"postgres \(PostgreSQL\) (?P<version>\d+\.\d+) \((?P<commit>[0-9a-f]{40})\)" + # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers. 
+ pattern = ( + r"postgres \(PostgreSQL\) (?P<version>\d+(?:beta|rc|\.)\d+) \((?P<commit>[0-9a-f]{40})\)" + ) match = re.search(pattern, output, re.IGNORECASE) assert match is not None, f"Can't parse {output} with {pattern}" version = match.group("version") commit = match.group("commit") - assert ( - pg_version.v_prefixed in expected_revisions - ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" - - msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" - assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg + if "." in version: + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d152d0f41f..f98b53d966 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from( truncated_layers = 0 elif branchpoint == Branchpoint.AFTER_L0: branch_at = Lsn(last_lsn + 8) + # make sure the branch point is not on a page header + if 0 < (branch_at.lsn_int % 8192) < 40: + branch_at += 40 rows = 8192 # as there is no 8 byte walrecord, nothing should get copied from the straddling layer truncated_layers = 0 diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index ea900b07b8..ebe65e7c29 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ 
-1,19 +1,32 @@ import os +from pathlib import Path +from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import ( + NeonEnv, + PgBin, + fork_at_current_lsn, + import_timeline_from_vanilla_postgres, +) # # Test branching, when a transaction is in prepared state # -def test_twophase(neon_simple_env: NeonEnv): - env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) +def twophase_test_on_timeline(env: NeonEnv): + endpoint = env.endpoints.create_start( + "test_twophase", config_lines=["max_prepared_transactions=5"] + ) conn = endpoint.connect() cur = conn.cursor() + # FIXME: Switch to the next WAL segment, to work around the bug fixed in + # https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be + # removed. + cur.execute("select pg_switch_wal()") + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row @@ -53,7 +66,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") # Start compute on the new branch endpoint2 = env.endpoints.create_start( @@ -80,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv): # Only one committed insert is visible on the original branch cur.execute("SELECT * FROM foo") assert cur.fetchall() == [("three",)] + + +def test_twophase(neon_simple_env: NeonEnv): + """ + Test branching, when a transaction is in prepared state + """ + env = neon_simple_env + env.neon_cli.create_branch("test_twophase") + + twophase_test_on_timeline(env) + + +def test_twophase_nonzero_epoch( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Same as 
'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs + have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in + pg_twophase filenames with PostgreSQL v17.) + """ + env = neon_simple_env + + # Reset the vanilla Postgres instance with a higher XID epoch + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)] + pg_bin.run_capture(cmd) + + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + env.initial_tenant, + timeline_id, + "test_twophase", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() # don't need the original server anymore + + twophase_test_on_timeline(env) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 new file mode 160000 index 0000000000..9156d63ce2 --- /dev/null +++ b/vendor/postgres-v17 @@ -0,0 +1 @@ +Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa