From 78938d1b591b33d23495a0edb8b123cc5cac6a27 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 12 Sep 2024 23:18:41 +0100 Subject: [PATCH] [compute/postgres] feature: PostgreSQL 17 (#8573) This adds preliminary PG17 support to Neon, based on RC1 / 2024-09-04 https://github.com/postgres/postgres/commit/07b828e9d4aa916f1763774787440d914eea69c4 NOTICE: The data produced by the included version of the PostgreSQL fork may not be compatible with the future full release of PostgreSQL 17 due to expected or unexpected future changes in magic numbers and internals. DO NOT EXPECT DATA IN V17-TENANTS TO BE COMPATIBLE WITH THE 17.0 RELEASE! Co-authored-by: Anastasia Lubennikova Co-authored-by: Alexander Bayandin Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- .github/workflows/_build-and-test-locally.yml | 19 +- .github/workflows/build_and_test.yml | 27 +- .github/workflows/neon_extra_builds.yml | 15 + .gitmodules | 4 + Dockerfile | 12 +- Dockerfile.compute-node | 289 +++- Makefile | 56 +- compute_tools/src/compute.rs | 31 +- compute_tools/src/extension_server.rs | 1 + control_plane/src/local_env.rs | 2 +- control_plane/src/storage_controller.rs | 40 +- libs/pageserver_api/src/key.rs | 39 +- libs/postgres_ffi/build.rs | 2 +- libs/postgres_ffi/src/lib.rs | 5 + libs/postgres_ffi/src/pg_constants.rs | 39 +- libs/postgres_ffi/src/pg_constants_v14.rs | 27 + libs/postgres_ffi/src/pg_constants_v15.rs | 2 + libs/postgres_ffi/src/pg_constants_v16.rs | 2 + libs/postgres_ffi/src/pg_constants_v17.rs | 55 + libs/postgres_ffi/wal_craft/src/lib.rs | 2 +- libs/walproposer/build.rs | 9 +- pageserver/ctl/src/layer_map_analyzer.rs | 16 +- pageserver/src/basebackup.rs | 16 +- pageserver/src/config.rs | 2 +- pageserver/src/import_datadir.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 111 +- pageserver/src/walingest.rs | 191 ++- pageserver/src/walrecord.rs | 73 +- pgxn/neon/bitmap.h | 12 + pgxn/neon/file_cache.c | 504 ++++-- pgxn/neon/libpagestore.c | 4 + pgxn/neon/neon_pgversioncompat.h | 14 +- pgxn/neon/pagestore_client.h | 54 +- pgxn/neon/pagestore_smgr.c | 1360 ++++++++++++----- pgxn/neon/walproposer_pg.c | 39 +- pgxn/neon_rmgr/neon_rmgr_decode.c | 399 ++++- pgxn/neon_walredo/inmem_smgr.c | 79 +- pgxn/neon_walredo/inmem_smgr.h | 2 +- pgxn/neon_walredo/walredoproc.c | 14 +- test_runner/fixtures/common_types.py | 2 +- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/fixtures/pg_version.py | 1 + .../5670669815/v17/ext_index.json | 7 + test_runner/regress/test_compatibility.py | 8 +- .../regress/test_download_extensions.py | 2 + test_runner/regress/test_postgres_version.py | 17 +- .../regress/test_timeline_detach_ancestor.py | 3 + test_runner/regress/test_twophase.py | 70 +- vendor/postgres-v17 | 1 + 49 files changed, 2907 insertions(+), 787 deletions(-) create mode 100644 libs/postgres_ffi/src/pg_constants_v17.rs create mode 100644 pgxn/neon/bitmap.h create mode 100644 test_runner/regress/data/extension_test/5670669815/v17/ext_index.json create mode 160000 vendor/postgres-v17 diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index e18e6a1201..67152b6991 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -62,7 +62,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -83,6 +83,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. @@ -136,6 +140,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -148,6 +159,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: mold -run make postgres-v16 -j$(nproc) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: mold -run make postgres-v17 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -210,7 +225,7 @@ jobs: run: | PQ_LIB_DIR=$(pwd)/pg_install/v16/lib export PQ_LIB_DIR - LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib export LD_LIBRARY_PATH #nextest does not yet support running doctests diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4bb9e5cb66..7c06fd9ab8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -211,7 +211,7 @@ jobs: build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds - pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }} + pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }} secrets: inherit # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking @@ -548,7 +548,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -627,7 +627,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v16' + if: matrix.version == 'v17' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -649,7 +649,7 @@ jobs: strategy: matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] steps: - uses: docker/login-action@v3 @@ -671,7 +671,7 @@ jobs: neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ @@ -689,7 +689,7 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v16' + if: matrix.version == 'v17' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -700,7 +700,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16 ] + version: [ v14, v15, v16, v17 ] env: VM_BUILDER_VERSION: v0.29.3 @@ -798,7 +798,7 @@ jobs: runs-on: ubuntu-22.04 env: - VERSIONS: v14 v15 v16 + VERSIONS: v14 v15 v16 v17 steps: - uses: docker/login-action@v3 @@ -839,7 +839,7 @@ jobs: done done docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ - neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Login to prod ECR uses: docker/login-action@v3 @@ -852,7 +852,7 @@ jobs: - name: Copy all images to prod ECR if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do + for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} done @@ -864,7 +864,7 @@ jobs: with: client_id: ${{ vars.AZURE_DEV_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -876,7 +876,7 @@ jobs: with: client_id: ${{ vars.AZURE_PROD_CLIENT_ID }} image_tag: ${{ needs.tag.outputs.build-tag }} - images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16 + images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17 registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }} tenant_id: ${{ vars.AZURE_TENANT_ID }} @@ -971,7 +971,7 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16; do + for r in 14 15 16 17; do git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" done @@ -1117,6 +1117,7 @@ jobs: files_to_promote+=("s3://${BUCKET}/${s3_key}") + # TODO Add v17 for pg_version in v14 v15 v16; do # We run less tests for debug builds, so we don't need to promote them if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7fecdbde8c..41c9f5dee5 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -72,6 +72,10 @@ jobs: id: pg_v16_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Set pg 17 revision for caching + id: pg_v17_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT + - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 @@ -93,6 +97,13 @@ jobs: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v17 build + id: cache_pg_17 + uses: actions/cache@v4 + with: + path: pg_install/v17 + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -120,6 +131,10 @@ jobs: if: steps.cache_pg_16.outputs.cache-hit != 'true' run: make postgres-v16 -j$(sysctl -n hw.ncpu) + - name: Build postgres v17 + if: steps.cache_pg_17.outputs.cache-hit != 'true' + run: make postgres-v17 -j$(sysctl -n hw.ncpu) + - name: Build neon extensions run: make neon-pg-ext -j$(sysctl -n hw.ncpu) diff --git a/.gitmodules b/.gitmodules index 1d925674a1..d1330bf28c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,7 @@ path = vendor/postgres-v16 url = https://github.com/neondatabase/postgres.git branch = REL_16_STABLE_neon +[submodule "vendor/postgres-v17"] + path = vendor/postgres-v17 + url = https://github.com/neondatabase/postgres.git + branch = REL_17_STABLE_neon diff --git a/Dockerfile b/Dockerfile index 1efedfa9bc..bdb76a4f4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned +ARG DEFAULT_PG_VERSION=17 +ARG STABLE_PG_VERSION=16 # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -13,6 +15,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 +COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -28,16 +31,19 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build WORKDIR /home/nonroot ARG GIT_VERSION=local ARG BUILD_TAG +ARG STABLE_PG_VERSION COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib +COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib COPY --chown=nonroot . . ARG ADDITIONAL_RUSTFLAGS RUN set -e \ - && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -52,6 +58,7 @@ RUN set -e \ # Build final image # FROM debian:bullseye-slim +ARG DEFAULT_PG_VERSION WORKDIR /data RUN set -e \ @@ -77,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ +COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. @@ -93,7 +101,7 @@ RUN mkdir -p /data/.neon/ && \ # When running a binary that links with libpq, default to using our most recent postgres version. Binaries # that want a particular postgres version will select it explicitly: this is just a default. -ENV LD_LIBRARY_PATH=/usr/local/v16/lib +ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib VOLUME ["/data"] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b6c89cd71f..fe902eb978 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -79,15 +79,23 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /sfcgal && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -96,7 +104,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -122,7 +133,10 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis -RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -142,12 +156,19 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti # ######################################################################################### FROM build-deps AS plv8-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ # generate and copy upgrade scripts @@ -172,9 +193,13 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.t # ######################################################################################### FROM build-deps AS h3-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "$(uname -m)" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "$(uname -m)" in \ "x86_64") \ export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ ;; \ @@ -192,7 +217,11 @@ RUN case "$(uname -m)" in \ && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + mkdir -p /h3/usr/ && \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ @@ -202,7 +231,10 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ @@ -218,9 +250,13 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3 # ######################################################################################### FROM build-deps AS unit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -239,6 +275,7 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - # ######################################################################################### FROM build-deps AS vector-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/pgvector.patch /pgvector.patch @@ -246,7 +283,10 @@ COPY patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ @@ -261,10 +301,14 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O # ######################################################################################### FROM build-deps AS pgjwt-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 -RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -277,9 +321,13 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 # ######################################################################################### FROM build-deps AS hypopg-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -293,9 +341,13 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypo # ######################################################################################### FROM build-deps AS pg-hashids-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -309,11 +361,15 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz # ######################################################################################### FROM build-deps AS rum-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY patches/rum.patch /rum.patch -RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \ patch -p1 < /rum.patch && \ @@ -328,9 +384,13 @@ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O r # ######################################################################################### FROM build-deps AS pgtap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -344,9 +404,13 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta # ######################################################################################### FROM build-deps AS ip4r-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -360,9 +424,13 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i # ######################################################################################### FROM build-deps AS prefix-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -376,9 +444,13 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p # ######################################################################################### FROM build-deps AS hll-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -392,9 +464,13 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar # ######################################################################################### FROM build-deps AS plpgsql-check-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -413,7 +489,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ @@ -446,7 +525,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -459,6 +541,9 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ + "v17") \ + echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -478,10 +563,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-cron-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -495,9 +584,13 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O # ######################################################################################### FROM build-deps AS rdkit-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y \ cmake \ libboost-iostreams1.74-dev \ @@ -507,7 +600,10 @@ RUN apt-get update && \ libeigen3-dev ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ @@ -544,10 +640,14 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. # ######################################################################################### FROM build-deps AS pg-uuidv7-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -561,10 +661,14 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz # ######################################################################################### FROM build-deps AS pg-roaringbitmap-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -578,10 +682,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 # ######################################################################################### FROM build-deps AS pg-semver-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -599,7 +707,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + case "${PG_VERSION}" in \ "v14" | "v15") \ export PG_EMBEDDING_VERSION=0.3.5 \ export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ @@ -620,10 +731,14 @@ RUN case "${PG_VERSION}" in \ # ######################################################################################### FROM build-deps AS pg-anon-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ @@ -641,6 +756,7 @@ RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tag # ######################################################################################### FROM build-deps AS rust-extensions-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ @@ -651,9 +767,11 @@ ENV HOME=/home/nonroot ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -672,7 +790,10 @@ USER root FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 @@ -694,7 +815,10 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar. FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION -RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -714,7 +838,10 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ # TODO update pgrx version in the pg_tiktoken repo and remove this line @@ -733,7 +860,10 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6 FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -748,10 +878,14 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz - ######################################################################################### FROM build-deps AS wal2json-pg-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -764,10 +898,14 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. # ######################################################################################### FROM build-deps AS pg-ivm-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -781,10 +919,14 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv # ######################################################################################### FROM build-deps AS pg-partman-build +ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -835,7 +977,10 @@ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ -RUN make -j $(getconf _NPROCESSORS_ONLN) \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon \ -s install && \ @@ -854,8 +999,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ case "${PG_VERSION}" in \ "v14" | "v15") \ ;; \ - "v16") \ - echo "Skipping HNSW for PostgreSQL 16" && exit 0 \ + "v16" | "v17") \ + echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \ ;; \ *) \ echo "unexpected PostgreSQL version" && exit 1 \ @@ -878,7 +1023,10 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . -RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto ######################################################################################### # @@ -899,15 +1047,24 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp # Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + rm -r /usr/local/pgsql/include # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + rm /usr/local/pgsql/lib/lib*.a ######################################################################################### @@ -918,7 +1075,10 @@ RUN rm /usr/local/pgsql/lib/lib*.a FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN mkdir /ext-src +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr @@ -956,18 +1116,39 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src COPY patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN cd /ext-src/ && for f in *.tar.gz; \ +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch # cmake is required for the h3 test -RUN apt-get update && apt-get install -y cmake -RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + apt-get update && apt-get install -y cmake +RUN case "${PG_VERSION}" in "v17") \ + echo "v17 extensions are not supported yet. Quit" && exit 0;; \ + esac && \ + cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh -RUN patch -p1 &str { "14" => return "v14", "15" => return "v15", "16" => return "v16", + "17" => return "v17", _ => {} }, _ => {} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 5dbc3bcbbc..d616154af6 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -342,7 +342,7 @@ impl LocalEnv { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index c715d6b789..2b714fbfbf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -28,6 +28,7 @@ use utils::{ auth::{encode_from_key_file, Claims, Scope}, id::{NodeId, TenantId}, }; +use whoami::username; pub struct StorageController { env: LocalEnv, @@ -183,7 +184,7 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; for v in prefer_versions { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); @@ -211,7 +212,16 @@ impl StorageController { /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); - let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)]; + let args = [ + "-h", + "localhost", + "-U", + &username(), + "-d", + DB_NAME, + "-p", + &format!("{}", postgres_port), + ]; let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; Ok(exitcode.success()) @@ -225,7 +235,11 @@ impl StorageController { /// /// Returns the database url pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result { - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!( + "postgresql://{}@localhost:{}/{DB_NAME}", + &username(), + postgres_port + ); let pg_bin_dir = self.get_pg_bin_dir().await?; let createdb_path = pg_bin_dir.join("createdb"); @@ -235,6 +249,10 @@ impl StorageController { "localhost", "-p", &format!("{}", postgres_port), + "-U", + &username(), + "-O", + &username(), DB_NAME, ]) .output() @@ -271,7 +289,7 @@ impl StorageController { // But tokio-postgres fork doesn't have this upstream commit: // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79 // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399 - .user(&whoami::username()) + .user(&username()) .dbname(DB_NAME) .connect(tokio_postgres::NoTls) .await @@ -328,6 +346,12 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? { + let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + tracing::info!( + "Initializing storage controller database with args: {:?}", + initdb_args + ); + // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) @@ -335,7 +359,7 @@ impl StorageController { ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), ]) - .args(["-D", pg_data_path.as_ref()]) + .args(initdb_args) .spawn() .expect("Failed to spawn initdb"); let status = child.wait().await?; @@ -364,8 +388,14 @@ impl StorageController { pg_data_path.as_ref(), "-l", pg_log_path.as_ref(), + "-U", + &username(), "start", ]; + tracing::info!( + "Starting storage controller database with args: {:?}", + db_start_args + ); background_process::start_process( "storage_controller_db", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8929ccb41d..4a776709c9 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,8 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::Oid; use postgres_ffi::RepOriginId; -use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::{fmt, ops::Range}; @@ -350,7 +350,17 @@ impl Key { // 02 00000000 00000000 00000000 00 00000000 // // TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID +// +// 02 00000000 00000000 00XXXXXX XX XXXXXXXX +// +// \______XID_________/ +// +// The 64-bit XID is stored a little awkwardly in field6, field5 and +// field4. PostgreSQL v16 and below only stored a 32-bit XID, which +// fit completely in field6, but starting with PostgreSQL v17, a full +// 64-bit XID is used. Most pageserver code that accesses +// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits +// are just unused. // // ControlFile: // 03 00000000 00000000 00000000 00 00000000 @@ -582,35 +592,36 @@ pub const TWOPHASEDIR_KEY: Key = Key { }; #[inline(always)] -pub fn twophase_file_key(xid: TransactionId) -> Key { +pub fn twophase_file_key(xid: u64) -> Key { Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, } } #[inline(always)] -pub fn twophase_key_range(xid: TransactionId) -> Range { +pub fn twophase_key_range(xid: u64) -> Range { + // 64-bit XIDs really should not overflow let (next_xid, overflowed) = xid.overflowing_add(1); Key { field1: 0x02, field2: 0, field3: 0, - field4: 0, - field5: 0, - field6: xid, + field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((xid & 0x000000FF00000000) >> 32) as u8, + field6: (xid & 0x00000000FFFFFFFF) as u32, }..Key { field1: 0x02, field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, + field3: u32::from(overflowed), + field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32, + field5: ((next_xid & 0x000000FF00000000) >> 32) as u8, + field6: (next_xid & 0x00000000FFFFFFFF) as u32, } } diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index a346390f3d..d3a85f2683 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15", "v16"] { + for pg_version in &["v14", "v15", "v16", "v17"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f18e0c603b..0d46ed6aac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -57,6 +57,7 @@ macro_rules! for_all_postgres_versions { $macro!(v14); $macro!(v15); $macro!(v16); + $macro!(v17); }; } @@ -91,6 +92,7 @@ macro_rules! dispatch_pgversion { 14 : v14, 15 : v15, 16 : v16, + 17 : v17, ] ) }; @@ -121,6 +123,7 @@ macro_rules! enum_pgversion_dispatch { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] ) }; @@ -150,6 +153,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; @@ -162,6 +166,7 @@ macro_rules! enum_pgversion { V14 : v14, V15 : v15, V16 : v16, + V17 : v17, ] } }; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6ce855c78e..61b49a634d 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -152,6 +152,9 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8; pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8; +// From heapam_xlog.h +pub const XLOG_HEAP2_REWRITE: u8 = 0x00; + // From replication/message.h pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00; @@ -219,15 +222,20 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; +/* pg_control.h */ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; -pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; -pub const XLP_LONG_HEADER: u16 = 0x0002; +pub const XLOG_PARAMETER_CHANGE: u8 = 0x60; +pub const XLOG_END_OF_RECOVERY: u8 = 0x90; /* From xlog.h */ pub const XLOG_REPLORIGIN_SET: u8 = 0x00; pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; +/* xlog_internal.h */ +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLP_LONG_HEADER: u16 = 0x0002; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -245,33 +253,6 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 = /* From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; -// List of subdirectories inside pgdata. -// Copied from src/bin/initdb/initdb.c -pub const PGDATA_SUBDIRS: [&str; 22] = [ - "global", - "pg_wal/archive_status", - "pg_commit_ts", - "pg_dynshmem", - "pg_notify", - "pg_serial", - "pg_snapshots", - "pg_subtrans", - "pg_twophase", - "pg_multixact", - "pg_multixact/members", - "pg_multixact/offsets", - "base", - "base/1", - "pg_replslot", - "pg_tblspc", - "pg_stat", - "pg_stat_tmp", - "pg_xact", - "pg_logical", - "pg_logical/snapshots", - "pg_logical/mappings", -]; - // Don't include postgresql.conf as it is inconvenient on node start: // we need postgresql.conf before basebackup to synchronize safekeepers // so no point in overwriting it during backup restore. Rest of the files diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 32f8f51114..fe01a5df7c 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -5,6 +5,33 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +// List of subdirectories inside pgdata. +// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 22] = [ + "global", + "pg_wal/archive_status", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 } diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 626a23c7ea..3cd1b7aec5 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 587be71cb3..31bd5b68fd 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -11,6 +11,8 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ +pub use super::super::v14::bindings::PGDATA_SUBDIRS; + pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs new file mode 100644 index 0000000000..2132938680 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -0,0 +1,55 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +// List of subdirectories inside pgdata. +// Copied from src/bin/initdb/initdb.c +pub const PGDATA_SUBDIRS: [&str; 23] = [ + "global", + "pg_wal/archive_status", + "pg_wal/summaries", + "pg_commit_ts", + "pg_dynshmem", + "pg_notify", + "pg_serial", + "pg_snapshots", + "pg_subtrans", + "pg_twophase", + "pg_multixact", + "pg_multixact/members", + "pg_multixact/offsets", + "base", + "base/1", + "pg_replslot", + "pg_tblspc", + "pg_stat", + "pg_stat_tmp", + "pg_xact", + "pg_logical", + "pg_logical/snapshots", + "pg_logical/mappings", +]; + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} + + +pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10; +pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20; +pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30; + + +pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0; +pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0; diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 6052f04d11..949e3f4251 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -53,7 +53,7 @@ impl Conf { #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index 28547f52bf..3f549889b8 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -5,6 +5,8 @@ use std::{env, path::PathBuf, process::Command}; use anyhow::{anyhow, Context}; +const WALPROPOSER_PG_VERSION: &str = "v17"; + fn main() -> anyhow::Result<()> { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=bindgen_deps.h"); @@ -36,7 +38,10 @@ fn main() -> anyhow::Result<()> { // Rebuild crate when libwalproposer.a changes println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); - let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); + let pg_config_bin = pg_install_abs + .join(WALPROPOSER_PG_VERSION) + .join("bin") + .join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) .arg("--includedir-server") @@ -53,7 +58,7 @@ fn main() -> anyhow::Result<()> { .into() } else { let server_path = pg_install_abs - .join("v16") + .join(WALPROPOSER_PG_VERSION) .join("include") .join("postgresql") .join("server") diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index a07107753e..adc090823d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -79,16 +79,24 @@ pub(crate) fn parse_filename(name: &str) -> Option { return None; } let keys: Vec<&str> = split[0].split('-').collect(); - let mut lsns: Vec<&str> = split[1].split('-').collect(); - let is_delta = if lsns.len() == 1 { - lsns.push(lsns[0]); + let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); + let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); + let the_lsns: [&str; 2]; + + /* + * Generations add a -vX-XXXXXX postfix, which causes issues when we try to + * parse 'vX' as an LSN. + */ + let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { + the_lsns = [lsns[0], lsns[0]]; false } else { + the_lsns = [lsns[0], lsns[1]]; true }; let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); let holes = Vec::new(); Some(LayerFile { key_range, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 207f781e1b..a32d09f3b3 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -30,9 +30,8 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; -use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; -use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; @@ -255,8 +254,11 @@ where let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + let pgversion = self.timeline.pg_version; + let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]); + // Create pgdata subdirs structure - for dir in PGDATA_SUBDIRS.iter() { + for dir in subdirs.iter() { let header = new_tar_header_dir(dir)?; self.ar .append(&header, &mut io::empty()) @@ -606,7 +608,7 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { + async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> { let img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) @@ -617,7 +619,11 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = format!("pg_twophase/{:>08X}", xid); + let path = if self.timeline.pg_version < 17 { + format!("pg_twophase/{:>08X}", xid) + } else { + format!("pg_twophase/{:>016X}", xid) + }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar .append(&header, &buf[..]) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 29a98855d3..e9f197ec2d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -281,7 +281,7 @@ impl PageServerConf { #[allow(clippy::manual_range_patterns)] match pg_version { - 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 5a0894cd1b..ca87f1d080 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -580,9 +580,11 @@ async fn import_file( import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let xid = u32::from_str_radix(file_name.as_ref(), 16)?; - let bytes = read_all_bytes(reader).await?; + + // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions, + // it's a 32-bit TransactionId, which fits in u64 anyway. + let xid = u64::from_str_radix(file_name.as_ref(), 16)?; modification .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6dd8851b13..5f8766ca2c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -633,7 +633,7 @@ impl Timeline { pub(crate) async fn get_twophase_file( &self, - xid: TransactionId, + xid: u64, lsn: Lsn, ctx: &RequestContext, ) -> Result { @@ -646,11 +646,19 @@ impl Timeline { &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result, PageReconstructError> { + ) -> Result, PageReconstructError> { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - Ok(TwoPhaseDirectory::des(&buf)?.xids) + if self.pg_version >= 17 { + Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) + } else { + Ok(TwoPhaseDirectory::des(&buf)? + .xids + .iter() + .map(|x| u64::from(*x)) + .collect()) + } } pub(crate) async fn get_control_file( @@ -902,9 +910,13 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; - let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + + let mut xids: Vec = self + .list_twophase_files(lsn, ctx) + .await? + .iter() + .cloned() + .collect(); xids.sort_unstable(); for xid in xids { result.add_key(twophase_file_key(xid)); @@ -1127,9 +1139,15 @@ impl<'a> DatadirModification<'a> { // Create AuxFilesDirectory self.init_aux_dir()?; - let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { - xids: HashSet::new(), - })?; + let buf = if self.tline.pg_version >= 17 { + TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { + xids: HashSet::new(), + }) + } else { + TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + }) + }?; self.pending_directory_entries .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); @@ -1321,22 +1339,31 @@ impl<'a> DatadirModification<'a> { pub async fn put_twophase_file( &mut self, - xid: TransactionId, + xid: u64, img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; - if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid = xid as u32; + let mut dir = TwoPhaseDirectory::des(&dirbuf)?; + if !dir.xids.insert(xid) { + anyhow::bail!("twophase file for xid {} already exists", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); self.put(twophase_file_key(xid), Value::Image(img)); Ok(()) @@ -1639,22 +1666,32 @@ impl<'a> DatadirModification<'a> { /// This method is used for marking truncated SLRU files pub async fn drop_twophase_file( &mut self, - xid: TransactionId, + xid: u64, ctx: &RequestContext, ) -> anyhow::Result<()> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let mut dir = TwoPhaseDirectory::des(&buf)?; + let newdirbuf = if self.tline.pg_version >= 17 { + let mut dir = TwoPhaseDirectoryV17::des(&buf)?; - if !dir.xids.remove(&xid) { - warn!("twophase file for xid {} does not exist", xid); - } - self.pending_directory_entries - .push((DirectoryKind::TwoPhase, dir.xids.len())); - self.put( - TWOPHASEDIR_KEY, - Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), - ); + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) + } else { + let xid: u32 = u32::try_from(xid)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); + Bytes::from(TwoPhaseDirectory::ser(&dir)?) + }; + self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf)); // Delete it self.delete(twophase_key_range(xid)); @@ -2124,11 +2161,21 @@ struct DbDirectory { dbdirs: HashMap<(Oid, Oid), bool>, } +// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of +// pg_twophase files was expanded from 32-bit XIDs to 64-bit XIDs. Previously, the files +// were named like "pg_twophase/000002E5", now they're like +// "pg_twophsae/0000000A000002E4". + #[derive(Debug, Serialize, Deserialize)] struct TwoPhaseDirectory { xids: HashSet, } +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectoryV17 { + xids: HashSet, +} + #[derive(Debug, Serialize, Deserialize, Default)] struct RelDirectory { // Set of relations that exist. (relfilenode, forknum) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 6e15ad81c3..229c01a681 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -237,6 +237,26 @@ impl WalIngest { .await?; } } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } pg_constants::RM_TBLSPC_ID => { @@ -246,7 +266,11 @@ impl WalIngest { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -260,7 +284,7 @@ impl WalIngest { .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf); + let xlrec = XlClogTruncate::decode(&mut buf, pg_version); self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } @@ -299,12 +323,21 @@ impl WalIngest { parsed_xact.xid, lsn, ); - modification - .drop_twophase_file(parsed_xact.xid, ctx) - .await?; + + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(parsed_xact.xid)? + } else { + parsed_xact.xid as u64 + }; + modification.drop_twophase_file(xid, ctx).await?; } else if info == pg_constants::XLOG_XACT_PREPARE { + let xid: u64 = if pg_version >= 17 { + self.adjust_to_full_transaction_id(decoded.xl_xid)? + } else { + decoded.xl_xid as u64 + }; modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) + .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } @@ -312,7 +345,11 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -325,7 +362,11 @@ impl WalIngest { ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = buf.get_u32_le(); + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; self.put_slru_page_image( @@ -354,6 +395,20 @@ impl WalIngest { pg_constants::RM_XLOG_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_PARAMETER_CHANGE { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlParameterChange::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_END_OF_RECOVERY { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlEndOfRecovery::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { if info == pg_constants::XLOG_NEXTOID { let next_oid = buf.get_u32_le(); @@ -397,12 +452,24 @@ impl WalIngest { if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let mut oldest_active_xid = cp.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = xid; + let oldest_active_xid = if pg_version >= 17 { + let mut oldest_active_full_xid = cp.nextXid.value; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if xid < oldest_active_full_xid { + oldest_active_full_xid = xid; + } } - } + oldest_active_full_xid as u32 + } else { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + let narrow_xid = xid as u32; + if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = narrow_xid; + } + } + oldest_active_xid + }; cp.oldestActiveXid = oldest_active_xid; } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; @@ -515,6 +582,25 @@ impl WalIngest { Ok(modification.len() > prev_len) } + /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + let next_full_xid = + enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); + + let next_xid = (next_full_xid) as u32; + let mut epoch = (next_full_xid >> 32) as u32; + + if xid > next_xid { + // Wraparound occurred, must be from a prev epoch. + if epoch == 0 { + bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"); + } + epoch -= 1; + } + + Ok((epoch as u64) << 32 | xid as u64) + } + /// Do not store this block, but observe it for the purposes of updating our relation size state. async fn observe_decoded_block( &mut self, @@ -815,6 +901,73 @@ impl WalIngest { bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } _ => {} } @@ -923,26 +1076,26 @@ impl WalIngest { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 => { + 16 | 17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); assert_eq!(0, buf.remaining()); if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_UPDATE | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); // the size of tuple data is inferred from the size of the record. // we can't validate the remaining number of bytes without parsing // the tuple data. @@ -958,7 +1111,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { @@ -974,7 +1127,7 @@ impl WalIngest { } } pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf); + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { old_heap_blkno = Some(decoded.blocks[0].blkno); flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 0c4d575de8..dd199e2c55 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -174,6 +174,7 @@ impl DecodedWALRecord { } 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -341,16 +342,47 @@ pub mod v14 { } } } + + #[repr(C)] + #[derive(Debug)] + pub struct XlParameterChange { + pub max_connections: i32, + pub max_worker_processes: i32, + pub max_wal_senders: i32, + pub max_prepared_xacts: i32, + pub max_locks_per_xact: i32, + pub wal_level: i32, + pub wal_log_hints: bool, + pub track_commit_timestamp: bool, + pub _padding: [u8; 2], + } + + impl XlParameterChange { + pub fn decode(buf: &mut Bytes) -> XlParameterChange { + XlParameterChange { + max_connections: buf.get_i32_le(), + max_worker_processes: buf.get_i32_le(), + max_wal_senders: buf.get_i32_le(), + max_prepared_xacts: buf.get_i32_le(), + max_locks_per_xact: buf.get_i32_le(), + wal_level: buf.get_i32_le(), + wal_log_hints: buf.get_u8() != 0, + track_commit_timestamp: buf.get_u8() != 0, + _padding: [buf.get_u8(), buf.get_u8()], + } + } + } } pub mod v15 { pub use super::v14::{ XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate, + XlParameterChange, }; } pub mod v16 { - pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert}; + pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; use bytes::{Buf, Bytes}; use postgres_ffi::{OffsetNumber, TransactionId}; @@ -529,6 +561,37 @@ pub mod v16 { } } +pub mod v17 { + pub use super::v14::XlHeapLockUpdated; + use bytes::{Buf, Bytes}; + pub use postgres_ffi::{TimeLineID, TimestampTz}; + + pub use super::v16::rm_neon; + pub use super::v16::{ + XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange, + }; + + #[repr(C)] + #[derive(Debug)] + pub struct XlEndOfRecovery { + pub end_time: TimestampTz, + pub this_time_line_id: TimeLineID, + pub prev_time_line_id: TimeLineID, + pub wal_level: i32, + } + + impl XlEndOfRecovery { + pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery { + XlEndOfRecovery { + end_time: buf.get_i64_le(), + this_time_line_id: buf.get_u32_le(), + prev_time_line_id: buf.get_u32_le(), + wal_level: buf.get_i32_le(), + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -746,9 +809,13 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { XlClogTruncate { - pageno: buf.get_u32_le(), + pageno: if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }, oldest_xid: buf.get_u32_le(), oldest_xid_db: buf.get_u32_le(), } diff --git a/pgxn/neon/bitmap.h b/pgxn/neon/bitmap.h new file mode 100644 index 0000000000..0a131816ef --- /dev/null +++ b/pgxn/neon/bitmap.h @@ -0,0 +1,12 @@ +#ifndef NEON_BITMAP_H +#define NEON_BITMAP_H + +/* + * Utilities for manipulating bits8* as bitmaps. + */ + +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + +#endif //NEON_BITMAP_H diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 479209a537..ab6739465b 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -27,6 +27,7 @@ #include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" +#include "port/pg_iovec.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" @@ -40,6 +41,7 @@ #include "utils/guc.h" #include "hll.h" +#include "bitmap.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -469,6 +471,99 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) return found; } +/* + * Check if page is present in the cache. + * Returns true if page is found in local cache. + */ +int +lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + int nblocks, bits8 *bitmap) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 chunk_offs; + int found = 0; + uint32 hash; + int i = 0; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return 0; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + + LWLockAcquire(lfc_lock, LW_SHARED); + + while (true) + { + int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs); + if (LFC_ENABLED()) + { + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + if (entry != NULL) + { + for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++) + { + if ((entry->bitmap[chunk_offs >> 5] & + (1 << (chunk_offs & 31))) != 0) + { + BITMAP_SET(bitmap, i); + found++; + } + } + } + else + { + i += this_chunk; + } + } + else + { + return found; + } + + /* + * Break out of the iteration before doing expensive stuff for + * a next iteration + */ + if (i + 1 >= nblocks) + break; + + /* + * Prepare for the next iteration. We don't unlock here, as that'd + * probably be more expensive than the gains it'd get us. + */ + tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1); + } + + LWLockRelease(lfc_lock); + +#if USE_ASSERT_CHECKING + do { + int count = 0; + + for (int j = 0; j < nblocks; j++) + { + if (BITMAP_ISSET(bitmap, j)) + count++; + } + + Assert(count == found); + } while (false); +#endif + + return found; +} + /* * Evict a page (if present) from the local file cache */ @@ -548,91 +643,171 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) } /* - * Try to read page from local cache. - * Returns true if page is found in local cache. - * In case of error local file cache is disabled (lfc->limit is set to zero). + * Try to read pages from local cache. + * Returns the number of pages read from the local cache, and sets bits in + * 'read' for the pages which were read. This may scribble over buffers not + * marked in 'read', so be careful with operation ordering. + * + * In case of error local file cache is disabled (lfc->limit is set to zero), + * and -1 is returned. Note that 'read' and the buffers may be touched and in + * an otherwise invalid state. + * + * If the mask argument is supplied, bits will be set at the offsets of pages + * that were present and read from the LFC. */ -bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +int +lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void **buffers, BlockNumber nblocks, bits8 *mask) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); bool result = true; uint32 hash; uint64 generation; uint32 entry_offset; + int blocks_read = 0; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ - return false; + return 0; if (!lfc_ensure_opened()) - return false; + return 0; CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. + */ + while (nblocks > 0) { + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + int iteration_hits = 0; + int iteration_misses = 0; + Assert(blocks_in_chunk > 0); + + for (int i = 0; i < blocks_in_chunk; i++) + { + iov[i].iov_base = buffers[buf_offset + i]; + iov[i].iov_len = BLCKSZ; + } + + tag.blockNum = blkno - chunk_offs; + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + /* We can return the blocks we've read before LFC got disabled; + * assuming we read any. */ + if (!LFC_ENABLED()) + { + LWLockRelease(lfc_lock); + return blocks_read; + } + + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set for the blocks assumed in this entry */ + for (int i = 0; i < blocks_in_chunk; i++) + { + tag.blockNum = blkno + i; + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + } + + if (entry == NULL) + { + /* Pages are not cached */ + lfc_ctl->misses += blocks_in_chunk; + pgBufferUsage.file_cache.misses += blocks_in_chunk; + LWLockRelease(lfc_lock); + + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + + continue; + } + + /* Unlink entry from LRU list to pin it for the duration of IO operation */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); + + generation = lfc_ctl->generation; + entry_offset = entry->offset; + LWLockRelease(lfc_lock); - return false; - } - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + for (int i = 0; i < blocks_in_chunk; i++) + { + /* + * If the page is valid, we consider it "read". + * All other pages will be fetched separately by the next cache + */ + if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32))) + { + BITMAP_SET(mask, buf_offset + i); + iteration_hits++; + } + else + iteration_misses++; + } - /* Approximate working set */ - tag.blockNum = blkno; - addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + Assert(iteration_hits + iteration_misses > 0); + + if (iteration_hits != 0) + { + rc = preadv(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + + if (rc != (BLCKSZ * blocks_in_chunk)) + { + lfc_disable("read"); + return -1; + } + } + + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + lfc_ctl->hits += iteration_hits; + lfc_ctl->misses += iteration_misses; + pgBufferUsage.file_cache.hits += iteration_hits; + pgBufferUsage.file_cache.misses += iteration_misses; + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + } + else + { + /* generation mismatch, assume error condition */ + LWLockRelease(lfc_lock); + return -1; + } - if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) - { - /* Page is not cached */ - lfc_ctl->misses += 1; - pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); - return false; - } - /* Unlink entry from LRU list to pin it for the duration of IO operation */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - generation = lfc_ctl->generation; - entry_offset = entry->offset; - LWLockRelease(lfc_lock); - - rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("read"); - return false; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + blkno += blocks_in_chunk; + blocks_read += iteration_hits; } - /* Place entry to the head of LRU list */ - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - lfc_ctl->hits += 1; - pgBufferUsage.file_cache.hits += 1; - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - } - else - result = false; - - LWLockRelease(lfc_lock); - - return result; + return blocks_read; } /* @@ -640,20 +815,17 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. */ void -#if PG_MAJORVERSION_NUM < 16 -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer) -#endif +lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *const *buffers, BlockNumber nblocks) { BufferTag tag; FileCacheEntry *entry; ssize_t rc; bool found; - int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); uint32 hash; uint64 generation; uint32 entry_offset; + int buf_offset = 0; if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ return; @@ -661,110 +833,142 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void if (!lfc_ensure_opened()) return; - tag.forkNum = forkNum; - tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); - hash = get_hash_value(lfc_hash, &tag); - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (!LFC_ENABLED()) + /* + * For every chunk that has blocks we're interested in, we + * 1. get the chunk header + * 2. Check if the chunk actually has the blocks we're interested in + * 3. Read the blocks we're looking for (in one preadv), assuming they exist + * 4. Update the statistics for the read call. + * + * If there is an error, we do an early return. + */ + while (nblocks > 0) { - LWLockRelease(lfc_lock); - return; - } + struct iovec iov[PG_IOV_MAX]; + int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); + int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + Assert(blocks_in_chunk > 0); - entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - - if (found) - { - /* - * Unlink entry from LRU list to pin it for the duration of IO - * operation - */ - if (entry->access_count++ == 0) - dlist_delete(&entry->list_node); - } - else - { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. - */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + for (int i = 0; i < blocks_in_chunk; i++) { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); + iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]); + iov[i].iov_len = BLCKSZ; } - else if (!dlist_is_empty(&lfc_ctl->holes)) + + tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1); + hash = get_hash_value(lfc_hash, &tag); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (!LFC_ENABLED()) { - /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool found; + LWLockRelease(lfc_lock); + return; + } - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); - CriticalAssert(found); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found); - lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + if (found) + { + /* + * Unlink entry from LRU list to pin it for the duration of IO + * operation + */ + if (entry->access_count++ == 0) + dlist_delete(&entry->list_node); } else { - lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ - } - entry->access_count = 1; - entry->hash = hash; - memset(entry->bitmap, 0, sizeof entry->bitmap); - } - - generation = lfc_ctl->generation; - entry_offset = entry->offset; - lfc_ctl->writes += 1; - LWLockRelease(lfc_lock); - - rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); - if (rc != BLCKSZ) - { - lfc_disable("write"); - } - else - { - LWLockAcquire(lfc_lock, LW_EXCLUSIVE); - - if (lfc_ctl->generation == generation) - { - CriticalAssert(LFC_ENABLED()); - /* Place entry to the head of LRU list */ - CriticalAssert(entry->access_count > 0); - if (--entry->access_count == 0) - dlist_push_tail(&lfc_ctl->lru, &entry->list_node); - - entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + /* + * We have two choices if all cache pages are pinned (i.e. used in IO + * operations): + * + * 1) Wait until some of this operation is completed and pages is + * unpinned. + * + * 2) Allocate one more chunk, so that specified cache size is more + * recommendation than hard limit. + * + * As far as probability of such event (that all pages are pinned) is + * considered to be very very small: there are should be very large + * number of concurrent IO operations and them are limited by + * max_connections, we prefer not to complicate code and use second + * approach. + */ + if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else if (!dlist_is_empty(&lfc_ctl->holes)) + { + /* We can reuse a hole that was left behind when the LFC was shrunk previously */ + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool found; + + hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found); + CriticalAssert(found); + + lfc_ctl->used += 1; + entry->offset = offset; /* reuse the hole */ + } + else + { + lfc_ctl->used += 1; + entry->offset = lfc_ctl->size++; /* allocate new chunk at end + * of file */ + } + entry->access_count = 1; + entry->hash = hash; + memset(entry->bitmap, 0, sizeof entry->bitmap); } + generation = lfc_ctl->generation; + entry_offset = entry->offset; + lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); + + rc = pwritev(lfc_desc, iov, blocks_in_chunk, + ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + if (rc != BLCKSZ * blocks_in_chunk) + { + lfc_disable("write"); + } + else + { + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + + if (lfc_ctl->generation == generation) + { + CriticalAssert(LFC_ENABLED()); + /* Place entry to the head of LRU list */ + CriticalAssert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->list_node); + + for (int i = 0; i < blocks_in_chunk; i++) + { + entry->bitmap[(chunk_offs + i) >> 5] |= + (1 << ((chunk_offs + i) & 31)); + } + } + + LWLockRelease(lfc_lock); + } + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; } } diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 5126c26c5d..df7000acc0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -537,7 +537,11 @@ pageserver_connect(shardno_t shard_no, int elevel) /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); +#if PG_MAJORVERSION_NUM >= 17 + shard->wes_read = CreateWaitEventSet(NULL, 3); +#else shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); +#endif AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index addb6ccce6..59b97d64fe 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -6,7 +6,11 @@ #ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H +#if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) +#else +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER) +#endif #define RelFileInfoEquals(a, b) ( \ NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ @@ -50,7 +54,7 @@ #define CopyNRelFileInfoToBufTag(tag, rinfo) \ do { \ (tag).rnode = (rinfo); \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) tag.rnode @@ -98,7 +102,7 @@ (tag).spcOid = (rinfo).spcOid; \ (tag).dbOid = (rinfo).dbOid; \ (tag).relNumber = (rinfo).relNumber; \ - } while (false); + } while (false) #define BufTagGetNRelFileInfo(tag) \ ((RelFileLocator) { \ @@ -113,4 +117,10 @@ #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#if PG_MAJORVERSION_NUM < 17 +#define ProcNumber BackendId +#define INVALID_PROC_NUMBER InvalidBackendId +#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) +#endif + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 1f196d016c..4c9e40a063 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -6,8 +6,6 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/neon/pagestore_client.h - * *------------------------------------------------------------------------- */ #ifndef pageserver_h @@ -187,7 +185,7 @@ extern char *nm_to_string(NeonMessage *msg); * API */ -typedef unsigned shardno_t; +typedef uint16 shardno_t; typedef struct { @@ -211,7 +209,7 @@ extern int neon_protocol_version; extern shardno_t get_shard_number(BufferTag* tag); -extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); @@ -233,8 +231,13 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbuffers, bool skipFsync); #endif +#if PG_MAJORVERSION_NUM >=17 +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif /* * LSN values associated with each request to the pageserver @@ -269,19 +272,11 @@ typedef struct } neon_request_lsns; #if PG_MAJORVERSION_NUM < 16 -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, char *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); #else -extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer); -extern void neon_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); #endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); @@ -299,17 +294,34 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -#if PG_MAJORVERSION_NUM < 16 -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - char *buffer); -#else -extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer); -#endif -extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const *buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *read for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7f39c7d026..36538ea5e2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,6 +58,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/interrupt.h" +#include "port/pg_iovec.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" @@ -66,6 +67,7 @@ #include "storage/smgr.h" #include "pagestore_client.h" +#include "bitmap.h" #if PG_VERSION_NUM >= 150000 #include "access/xlogrecovery.h" @@ -170,16 +172,28 @@ typedef enum PrefetchStatus * valid */ } PrefetchStatus; +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_SEQ = 0x1, +} PrefetchRequestFlags; + typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ - PrefetchStatus status; - shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; +StaticAssertDecl(sizeof(PrefetchRequest) == 64, + "We prefer to have a power-of-2 size for this struct. Please" + " try to find an alternative solution before reaching to" + " increase the expected size here"); + /* prefetch buffer lookup hash table */ typedef struct PrfHashEntry @@ -251,17 +265,17 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) -#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) -#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) - static PrefetchState *MyPState; +#define GetPrfSlotNoCheck(ring_index) ( \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ +) + #define GetPrfSlot(ring_index) ( \ ( \ AssertMacro((ring_index) < MyPState->ring_unused && \ (ring_index) >= MyPState->ring_last), \ - &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + GetPrfSlotNoCheck(ring_index) \ ) \ ) @@ -281,9 +295,17 @@ static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns); +#endif -static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); -static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks, const bits8 *mask); +static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot); static bool @@ -729,9 +751,9 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns if (force_request_lsns) slot->request_lsns = *force_request_lsns; else - slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum); + neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, slot->buftag.blockNum, + &slot->request_lsns, 1, NULL); request.req.lsn = slot->request_lsns.request_lsn; request.req.not_modified_since = slot->request_lsns.not_modified_since; @@ -771,141 +793,194 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns */ static uint64 -prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) { - uint64 ring_index; + uint64 min_ring_index; PrefetchRequest req; - PrefetchRequest *slot; - PrfHashEntry *entry; +#if USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. */ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); /* use an intermediate PrefetchRequest struct to ensure correct alignment */ req.buftag = tag; + Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); - - if (entry != NULL) + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; - /* - * If the caller specified a request LSN to use, only accept prefetch - * responses that satisfy that request. - */ - if (force_request_lsns) - { - if (!neon_prefetch_response_usable(*force_request_lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - } - } +#if USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + req.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); if (entry != NULL) { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag)); + /* - * We received a prefetch for a page that was recently read and - * removed from the buffers. Remove that request from the buffers. + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. */ - if (slot->status == PRFS_TAG_REMAINS) + if (lsns) { - prefetch_set_unused(ring_index); - entry = NULL; + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. (We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. + */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); } else { - /* The buffered request is good enough, return that index */ - pgBufferUsage.prefetch.duplicates++; - return ring_index; + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } - } - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page unnecessarily - * in that case. If the oldest slot holds a request that we haven't - * received a response for yet, we have to wait for the response to that - * before we can continue. We might not have even flushed the request to - * the pageserver yet, it might be just sitting in the output buffer. In - * that case, we flush it and wait for the response. (We could decide not - * to send it, but it's hard to abort when the request is already in the - * output buffer, and 'not sending' a prefetch request kind of goes - * against the principles of prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); - } - } + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. + */ + slot->buftag = req.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + + min_ring_index = Min(min_ring_index, ring_index); + + prefetch_do_request(slot, lsns); } - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; + Assert(any_hits); - Assert(MyPState->ring_last <= ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. - */ - slot->buftag = tag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - - prefetch_do_request(slot, force_request_lsns); - Assert(slot->status == PRFS_REQUESTED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) @@ -921,9 +996,17 @@ Retry: MyPState->ring_flush = MyPState->ring_unused; } - return ring_index; + return min_ring_index; } + +static uint64 +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) +{ + return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL); +} + + /* * Note: this function can get canceled and use a long jump to the next catch * context. Take care. @@ -1348,6 +1431,50 @@ log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } +#if PG_MAJORVERSION_NUM >= 17 +/* + * Wrapper around log_newpages() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, + BlockNumber nblocks, Page *pages, bool page_std) +{ + PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pageptrs[XLR_MAX_BLOCK_ID]; + int nregistered = 0; + XLogRecPtr result = 0; + + for (int i = 0; i < nblocks; i++) + { + Page page = copied_buffer[nregistered].data; + memcpy(page, pages[i], BLCKSZ); + pageptrs[nregistered] = page; + blknos[nregistered] = blkno + i; + + ++nregistered; + + if (nregistered >= XLR_MAX_BLOCK_ID) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + nregistered = 0; + } + } + + if (nregistered != 0) + { + log_newpages(rinfo, forkNum, nregistered, blknos, pageptrs, + page_std); + } + + return ProcLastRecPtr; +} +#endif /* PG_MAJORVERSION_NUM >= 17 */ + /* * Is 'buffer' identical to a freshly initialized empty heap page? */ @@ -1361,14 +1488,160 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +#if PG_MAJORVERSION_NUM >= 17 +static void +neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, const char **buffers, bool force) +{ +#define BLOCK_BATCH_SIZE 16 + bool log_pages; + BlockNumber batch_blockno = blocknum; + XLogRecPtr lsns[BLOCK_BATCH_SIZE]; + int batch_size = 0; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + log_pages = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_pages = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_pages = true; + } + + if (log_pages) + { + XLogRecPtr recptr; + recptr = log_newpages_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + nblocks, (Page *) buffers, false); + + for (int i = 0; i < nblocks; i++) + PageSetLSN(unconstify(char *, buffers[i]), recptr); + + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u through %u of relation %u/%u/%u.%u " + "were force logged, lsn=%X/%X", + blocknum, blocknum + nblocks, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(recptr)))); + } + + for (int i = 0; i < nblocks; i++) + { + Page page = (Page) buffers[i]; + BlockNumber blkno = blocknum + i; + XLogRecPtr lsn = PageGetLSN(page); + + if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. + */ + if (PageIsNew(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (PageIsEmptyHeapPage(page)) + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + } + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) + { + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? LOG : PANIC, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ + } + } + else + { + ereport(SmgrTrace, + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + lsns[batch_size++] = lsn; + + if (batch_size >= BLOCK_BATCH_SIZE) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + batch_blockno += batch_size; + batch_size = 0; + } + } + + if (batch_size != 0) + { + SetLastWrittenLSNForBlockv(lsns, InfoFromSMgrRel(reln), forknum, + batch_blockno, + batch_size); + } +} +#endif + /* * A page is being evicted from the shared buffer cache. Update the * last-written LSN of the page, and WAL-log it if needed. */ -static void #if PG_MAJORVERSION_NUM < 16 +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) #else +static void neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force) #endif { @@ -1548,18 +1821,39 @@ nm_adjust_lsn(XLogRecPtr lsn) return lsn; } + +/* + * Since PG17 we use vetorized version, + * so add compatibility function for older versions + */ +#if PG_MAJORVERSION_NUM < 17 +static void +GetLastWrittenLSNv(NRelFileInfo relfilenode, ForkNumber forknum, + BlockNumber blkno, int nblocks, XLogRecPtr *lsns) +{ + lsns[0] = GetLastWrittenLSN(relfilenode, forknum, blkno); +} +#endif + /* * Return LSN for requesting pages and number of blocks from page server */ -static neon_request_lsns -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static void +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + neon_request_lsns *output, BlockNumber nblocks, + const bits8 *mask) { - XLogRecPtr last_written_lsn; - neon_request_lsns result; + XLogRecPtr last_written_lsns[PG_IOV_MAX]; - last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - last_written_lsn = nm_adjust_lsn(last_written_lsn); - Assert(last_written_lsn != InvalidXLogRecPtr); + Assert(nblocks <= PG_IOV_MAX); + + GetLastWrittenLSNv(rinfo, forknum, blkno, (int) nblocks, last_written_lsns); + + for (int i = 0; i < nblocks; i++) + { + last_written_lsns[i] = nm_adjust_lsn(last_written_lsns[i]); + Assert(last_written_lsns[i] != InvalidXLogRecPtr); + } if (RecoveryInProgress()) { @@ -1630,95 +1924,111 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) /* Request the page at the end of the last fully replayed LSN. */ XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - if (last_written_lsn > replay_lsn) + for (int i = 0; i < nblocks; i++) { - /* GetCurrentReplayRecPtr was introduced in v15 */ + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ #if PG_VERSION_NUM >= 150000 - Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); #endif - /* - * Cases 2 and 4. If this is a backend (case 4), the - * neon_read_at_lsn() call later will wait for the WAL record to be - * fully replayed. - */ - result.request_lsn = last_written_lsn; - } - else - { - /* cases 1 and 3 */ - result.request_lsn = replay_lsn; - } - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = result.request_lsn; - Assert(last_written_lsn <= result.request_lsn); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result->request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result->request_lsn = replay_lsn; + } - neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", - LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = result->request_lsn; + Assert(last_written_lsn <= result->request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result->request_lsn), LSN_FORMAT_ARGS(result->not_modified_since)); + } } else { XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache as the - * 'not_modified_since' hint. Any pages modified by later WAL records - * must still in the buffer cache, so our request cannot concern - * those. - */ - neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", - LSN_FORMAT_ARGS(last_written_lsn)); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. However, such case does exist at index - * building, _bt_blwritepage logs the full page without flushing WAL - * before smgrextend (files are fsynced before build ends). - */ #if PG_VERSION_NUM >= 150000 flushlsn = GetFlushRecPtr(NULL); #else flushlsn = GetFlushRecPtr(); #endif - if (last_written_lsn > flushlsn) + + for (int i = 0; i < nblocks; i++) { - neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - LSN_FORMAT_ARGS(last_written_lsn), - LSN_FORMAT_ARGS(flushlsn)); - XLogFlush(last_written_lsn); - flushlsn = last_written_lsn; + neon_request_lsns *result = &output[i]; + XLogRecPtr last_written_lsn = last_written_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + /* + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. + */ + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). + */ + if (last_written_lsn > flushlsn) + { + neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; + } + + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result->request_lsn = UINT64_MAX; + result->not_modified_since = last_written_lsn; + result->effective_request_lsn = flushlsn; } - - /* - * Request the very latest version of the page. In principle we - * want to read the page at the current insert LSN, and we could - * use that value in the request. However, there's a corner case - * with pageserver's garbage collection. If the GC horizon is - * set to a very small value, it's possible that by the time - * that the pageserver processes our request, the GC horizon has - * already moved past the LSN we calculate here. Standby servers - * always have that problem as the can always lag behind the - * primary, but for the primary we can avoid it by always - * requesting the latest page, by setting request LSN to - * UINT64_MAX. - * - * Remember the current LSN, however, so that we can later - * correctly determine if the response to the request is still - * valid. The most up-to-date LSN we could use for that purpose - * would be the current insert LSN, but to avoid the overhead of - * looking it up, use 'flushlsn' instead. This relies on the - * assumption that if the page was modified since the last WAL - * flush, it should still be in the buffer cache, and we - * wouldn't be requesting it. - */ - result.request_lsn = UINT64_MAX; - result.not_modified_since = last_written_lsn; - result.effective_request_lsn = flushlsn; } - - return result; } /* @@ -1728,13 +2038,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) * satisfy a page read now. */ static bool -neon_prefetch_response_usable(neon_request_lsns request_lsns, +neon_prefetch_response_usable(neon_request_lsns *request_lsns, PrefetchRequest *slot) { /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); - Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); @@ -1755,15 +2065,15 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, * calculate LSNs "out of order" with each other, but the prefetch queue * is backend-private at the moment.) */ - if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) { ereport(LOG, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "request with unexpected LSN after prefetch"), errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); return false; @@ -1817,9 +2127,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns, */ /* this follows from the checks above */ - Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1886,7 +2196,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, @@ -2068,7 +2379,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, */ if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2149,7 +2460,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) + !AmAutoVacuumWorkerProcess()) { uint64 current_size = GetNeonCurrentClusterSize(); @@ -2247,6 +2558,73 @@ neon_close(SMgrRelation reln, ForkNumber forknum) } +#if PG_MAJORVERSION_NUM >= 17 +/* + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + BufferTag tag; + bool io_initiated = false; + + switch (reln->smgr_relpersistence) + { + case 0: /* probably shouldn't happen, but ignore it */ + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum, nblocks); + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + tag.spcOid = reln->smgr_rlocator.locator.spcOid; + tag.dbOid = reln->smgr_rlocator.locator.dbOid; + tag.relNumber = reln->smgr_rlocator.locator.relNumber; + tag.forkNum = forknum; + + while (nblocks > 0) + { + int iterblocks = Min(nblocks, PG_IOV_MAX); + int seqlen = 0; + bits8 lfc_present[PG_IOV_MAX / 8]; + memset(lfc_present, 0, sizeof(lfc_present)); + + if (lfc_cache_containsv(InfoFromSMgrRel(reln), forknum, blocknum, + iterblocks, lfc_present) == iterblocks) + { + nblocks -= iterblocks; + blocknum += iterblocks; + continue; + } + + io_initiated = true; + + tag.blockNum = blocknum; + + for (int i = 0; i < PG_IOV_MAX / 8; i++) + lfc_present[i] = ~(lfc_present[i]); + + ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, + lfc_present); + nblocks -= iterblocks; + blocknum += iterblocks; + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + } + + return false; +} + + +#else /* PG_MAJORVERSION_NUM >= 17 */ /* * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -2285,6 +2663,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return false; } +#endif /* PG_MAJORVERSION_NUM < 17 */ + /* * neon_writeback() -- Tell the kernel to write pages back to storage. @@ -2315,7 +2695,12 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - /* not implemented */ + /* + * TODO: WAL sync up to lwLsn for the indicated blocks + * Without that sync, writeback doesn't actually guarantee the data is + * persistently written, which does seem to be one of the assumed + * properties of this smgr API call. + */ neon_log(SmgrTrace, "writeback noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -2324,30 +2709,27 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * While function is defined in the neon extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. - */ -void +static void #if PG_MAJORVERSION_NUM < 16 -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, char *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + char **buffers, BlockNumber nblocks, const bits8 *mask) #else -neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - neon_request_lsns request_lsns, void *buffer) +neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) #endif { NeonResponse *resp; uint64 ring_index; PrfHashEntry *entry; PrefetchRequest *slot; - BufferTag buftag = - { - .forkNum = forkNum, - .blockNum = blkno, - }; + BufferTag buftag = {0}; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); CopyNRelFileInfoToBufTag(buftag, rinfo); + buftag.forkNum = forkNum; + buftag.blockNum = base_blockno; /* * The redo process does not lock pages that it needs to replay but are @@ -2365,115 +2747,147 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * weren't for the behaviour of the LwLsn cache that uses the highest * value of the LwLsn cache when the entry is not found. */ - if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsns.request_lsn); + prefetch_register_bufferv(buftag, request_lsns, nblocks, mask); - /* - * Try to find prefetched page in the list of received pages. - */ + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + + if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) + continue; + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns[0].request_lsn); + + /* + * Try to find prefetched page in the list of received pages. + */ Retry: - entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(request_lsns, slot)) + if (entry != NULL) { - ring_index = slot->my_ring_index; - pgBufferUsage.prefetch.hits += 1; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; + ring_index = slot->my_ring_index; + pgBufferUsage.prefetch.hits += 1; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. + */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - /* make it look like a prefetch cache miss */ - entry = NULL; } - } - do - { - if (entry == NULL) + do { - pgBufferUsage.prefetch.misses += 1; + if (entry == NULL) + { + pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_lsns); - slot = GetPrfSlot(ring_index); - } - else + ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. - */ - entry = NULL; + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rinfo, forkNum, blockno, buffer); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rinfo, forkNum, blkno, buffer); - break; - - case T_NeonErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, blkno, - RelFileInfoFmt(rinfo), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup_trailing_unused(); } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); } /* - * neon_read() -- Read the specified block from a relation. + * While function is defined in the neon extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. */ void #if PG_MAJORVERSION_NUM < 16 +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, char *buffer) +#else +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + neon_request_lsns request_lsns, void *buffer) +#endif +{ + neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); +} + +#if PG_MAJORVERSION_NUM < 17 +/* + * neon_read() -- Read the specified block from a relation. + */ +#if PG_MAJORVERSION_NUM < 16 +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) #else +void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { @@ -2502,7 +2916,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -2578,6 +2992,148 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } #endif } +#endif /* PG_MAJORVERSION_NUM <= 16 */ + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) +{ + bits8 read[PG_IOV_MAX / 8]; + neon_request_lsns request_lsns[PG_IOV_MAX]; + int lfc_result; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (nblocks > PG_IOV_MAX) + neon_log(ERROR, "Read request too large: %d is larger than max %d", + nblocks, PG_IOV_MAX); + + memset(read, 0, sizeof(read)); + + /* Try to read from local file cache */ + lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, + nblocks, read); + + /* Read all blocks from LFC, so we're done */ + if (lfc_result == nblocks) + return; + + if (lfc_result == -1) + { + /* can't use the LFC result, so read all blocks from PS */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = 0xFF; + } + else + { + /* invert the result: exclude blocks read from lfc */ + for (int i = 0; i < PG_IOV_MAX / 8; i++) + read[i] = ~(read[i]); + } + + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, + request_lsns, nblocks, read); + + neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + for (int i = 0; i < nblocks; i++) + { +#if PG_MAJORVERSION_NUM >= 17 + mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); +#else + mdread(reln, forkNum, blkno + i, mdbuf); +#endif + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew((Page) mdbuf)) + { + if (!PageIsNew((Page) pageserver_masked)) + { + neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew((Page) buffer)) + { + neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } + } +#endif +} +#endif #ifdef DEBUG_COMPARE_LOCAL static char * @@ -2623,7 +3179,72 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo if (mdexists(reln, forknum)) { /* It exists locally. Guess it's unlogged then. */ +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif + return; + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_wallog_page(reln, forknum, blocknum, buffer, false); + + lsn = PageGetLSN((Page) buffer); + neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + #if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + #else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + #endif +#endif +} + + + +#if PG_MAJORVERSION_NUM >= 17 +void +neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); /* * We could set relpersistence now that we have determined @@ -2641,29 +3262,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); return; - default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - neon_wallog_page(reln, forknum, blocknum, buffer, false); + neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false); - lsn = PageGetLSN((Page) buffer); - neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, blocknum, - (uint32) (lsn >> 32), (uint32) lsn); - - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); #endif } +#endif + /* * neon_nblocks() -- Get the number of blocks stored in a relation. */ @@ -2699,7 +3315,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, @@ -2757,7 +3375,9 @@ neon_dbsize(Oid dbNode) neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + neon_get_request_lsns(dummy_node, MAIN_FORKNUM, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL); + { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, @@ -2898,6 +3518,38 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +#if PG_MAJORVERSION_NUM >= 17 +void +neon_regisersync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrregistersync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdregistersync(reln, forknum); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_log(SmgrTrace, "[NEON_SMGR] registersync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} +#endif + + /* * neon_start_unlogged_build() -- Starting build operation on a rel. * @@ -3047,8 +3699,11 @@ neon_end_unlogged_build(SMgrRelation reln) static int neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - XLogRecPtr request_lsn, - not_modified_since; + XLogRecPtr request_lsn, + not_modified_since; + SlruKind kind; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -3078,32 +3733,30 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf */ not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); - SlruKind kind; - - if (STRPREFIX(path, "pg_xact")) - kind = SLRU_CLOG; - else if (STRPREFIX(path, "pg_multixact/members")) - kind = SLRU_MULTIXACT_MEMBERS; - else if (STRPREFIX(path, "pg_multixact/offsets")) - kind = SLRU_MULTIXACT_OFFSETS; - else - return -1; + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; NeonResponse *resp; NeonGetSlruSegmentRequest request = { .req.tag = T_NeonGetSlruSegmentRequest, .req.lsn = request_lsn, .req.not_modified_since = not_modified_since, - .kind = kind, .segno = segno }; - int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do { while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); } while (resp == NULL); @@ -3182,14 +3835,23 @@ static const struct f_smgr neon_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = neon_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = neon_prefetch, + .smgr_readv = neon_readv, + .smgr_writev = neon_writev, +#else .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, +#endif + .smgr_writeback = neon_writeback, .smgr_nblocks = neon_nblocks, .smgr_truncate = neon_truncate, .smgr_immedsync = neon_immedsync, - +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = neon_regisersync, +#endif .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, @@ -3198,11 +3860,11 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, NRelFileInfo rinfo) +smgr_neon(ProcNumber backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) + if (backend != INVALID_PROC_NUMBER) return smgr_standard(backend, rinfo); else return &neon_smgr; diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 65ef588ba5..4d0d06e6de 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -81,6 +81,7 @@ static void nwp_register_gucs(void); static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); +static uint64 startup_backpressure_wrap(void); static bool backpressure_throttling_impl(void); static void walprop_register_bgworker(void); @@ -90,7 +91,7 @@ static void walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); static void walprop_pg_load_libpqwalreceiver(void); -static process_interrupts_callback_t PrevProcessInterruptsCallback; +static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL; static shmem_startup_hook_type prev_shmem_startup_hook_type; #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; @@ -178,7 +179,7 @@ pg_init_walproposer(void) nwp_prepare_shmem(); - delay_backend_us = &backpressure_lag_impl; + delay_backend_us = &startup_backpressure_wrap; PrevProcessInterruptsCallback = ProcessInterruptsCallback; ProcessInterruptsCallback = backpressure_throttling_impl; @@ -352,6 +353,22 @@ backpressure_lag_impl(void) return 0; } +/* + * We don't apply backpressure when we're the postmaster, or the startup + * process, because in postmaster we can't apply backpressure, and in + * the startup process we can't afford to slow down. + */ +static uint64 +startup_backpressure_wrap(void) +{ + if (AmStartupProcess() || !IsUnderPostmaster) + return 0; + + delay_backend_us = &backpressure_lag_impl; + + return backpressure_lag_impl(); +} + /* * WalproposerShmemSize --- report amount of shared memory space needed */ @@ -401,12 +418,13 @@ WalproposerShmemInit_SyncSafekeeper(void) static bool backpressure_throttling_impl(void) { - int64 lag; + uint64 lag; TimestampTz start, stop; - bool retry = PrevProcessInterruptsCallback - ? PrevProcessInterruptsCallback() - : false; + bool retry = false; + + if (PointerIsValid(PrevProcessInterruptsCallback)) + retry = PrevProcessInterruptsCallback(); /* * Don't throttle read only transactions or wal sender. Do throttle CREATE @@ -602,7 +620,12 @@ walprop_pg_init_walsender(void) /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { +#if PG_MAJORVERSION_NUM >= 17 + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, + false, false, false); +#else ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); +#endif ReplicationSlotReserveWal(); /* Write this slot to disk */ ReplicationSlotMarkDirty(); @@ -1509,7 +1532,11 @@ walprop_pg_init_event_set(WalProposer *wp) wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ +#if PG_MAJORVERSION_NUM >= 17 + waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers); +#else waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); +#endif AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c index f327e132e9..66032c88f6 100644 --- a/pgxn/neon_rmgr/neon_rmgr_decode.c +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -1,6 +1,7 @@ #include "postgres.h" #if PG_MAJORVERSION_NUM >= 16 + #include "access/heapam_xlog.h" #include "access/neon_xlog.h" #include "replication/decode.h" @@ -9,6 +10,10 @@ #include "neon_rmgr.h" +#endif /* PG >= 16 */ + +#if PG_MAJORVERSION_NUM == 16 + /* individual record(group)'s handlers */ static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); @@ -399,6 +404,398 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) header->t_infomask2 = xlhdr.t_infomask2; header->t_hoff = xlhdr.t_hoff; } +#endif + +#if PG_MAJORVERSION_NUM == 17 + +/* individual record(group)'s handlers */ +static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple); -#endif \ No newline at end of file +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. + */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. + */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. + */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. + */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + HeapTuple tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* + * We can only figure this out after reassembling the transactions. + */ + tuple->t_tableOid = InvalidOid; + + tuple->t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. + */ +static void +DecodeXLogTuple(char *data, Size len, HeapTuple tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->t_len = datalen + SizeofHeapTupleHeader; + header = tuple->t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} +#endif diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 4e604a710c..a45e8f5c4a 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -68,8 +68,13 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum); static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM >= 17 +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); +#else static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#endif #if PG_MAJORVERSION_NUM < 16 static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); @@ -93,7 +98,9 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - +#if PG_MAJORVERSION_NUM >= 17 +static void inmem_registersync(SMgrRelation reln, ForkNumber forknum); +#endif /* * inmem_init() -- Initialize private state @@ -190,6 +197,14 @@ inmem_close(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + return true; +} +#else /* * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation */ @@ -198,6 +213,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { return true; } +#endif /* * inmem_writeback() -- Tell the kernel to write pages back to storage. @@ -211,11 +227,13 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, /* * inmem_read() -- Read the specified block from a relation. */ +#if PG_MAJORVERSION_NUM < 16 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, -#if PG_MAJORVERSION_NUM < 16 char *buffer) #else +static void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, void *buffer) #endif { @@ -228,6 +246,18 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, memcpy(buffer, page_body[pg], BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + void **buffers, BlockNumber nblocks) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_read(reln, forknum, blkno, buffers[i]); + } +} +#endif + /* * inmem_write() -- Write the supplied block at the appropriate location. * @@ -280,6 +310,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, memcpy(page_body[pg], buffer, BLCKSZ); } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + for (int i = 0; i < nblocks; i++) + { + inmem_write(reln, forknum, blkno, buffers[i], skipFsync); + } +} +#endif + /* * inmem_nblocks() -- Get the number of blocks stored in a relation. */ @@ -315,6 +357,13 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } +#if PG_MAJORVERSION_NUM >= 17 +static void +inmem_registersync(SMgrRelation reln, ForkNumber forknum) +{ +} +#endif + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -328,23 +377,39 @@ static const struct f_smgr inmem_smgr = #if PG_MAJORVERSION_NUM >= 16 .smgr_zeroextend = inmem_zeroextend, #endif +#if PG_MAJORVERSION_NUM >= 17 + .smgr_prefetch = inmem_prefetch, + .smgr_readv = inmem_readv, + .smgr_writev = inmem_writev, +#else .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, +#endif .smgr_writeback = inmem_writeback, .smgr_nblocks = inmem_nblocks, .smgr_truncate = inmem_truncate, .smgr_immedsync = inmem_immedsync, + +#if PG_MAJORVERSION_NUM >= 17 + .smgr_registersync = inmem_registersync, +#endif + + .smgr_start_unlogged_build = NULL, + .smgr_finish_unlogged_build_phase_1 = NULL, + .smgr_end_unlogged_build = NULL, + .smgr_read_slru_segment = NULL, }; const f_smgr * -smgr_inmem(BackendId backend, NRelFileInfo rinfo) +smgr_inmem(ProcNumber backend, NRelFileInfo rinfo) { Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rinfo); - else - return &inmem_smgr; + // // What does this code do? + // if (backend != INVALID_PROC_NUMBER) + // return smgr_standard(backend, rinfo); + // else + return &inmem_smgr; } void diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index 58b98b8e6a..91f1c80965 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); +extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index cc545393f5..219ca85207 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -100,6 +100,9 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/dsm.h" +#if PG_MAJORVERSION_NUM >= 17 +#include "storage/dsm_registry.h" +#endif #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -137,7 +140,7 @@ static BufferTag target_redo_tag; static XLogReaderState *reader_state; -#define TRACE DEBUG5 +#define TRACE LOG #ifdef HAVE_LIBSECCOMP @@ -517,6 +520,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up xlog, clog, and buffers */ +#if PG_MAJORVERSION_NUM >= 17 + DSMRegistryShmemInit(); + VarsupShmemInit(); +#endif XLOGShmemInit(); CLOGShmemInit(); CommitTsShmemInit(); @@ -566,7 +573,10 @@ CreateFakeSharedMemoryAndSemaphores() /* * Set up other modules that need some shared memory space */ +#if PG_MAJORVERSION_NUM < 17 + /* "snapshot too old" was removed in PG17, and with it the SnapMgr */ SnapMgrInit(); +#endif BTreeShmemInit(); SyncScanShmemInit(); /* Skip due to the 'pg_notify' directory check */ @@ -742,7 +752,7 @@ BeginRedoForBlock(StringInfo input_message) target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index 064a678c96..d8390138c9 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -13,7 +13,7 @@ DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 class Lsn: """ Datatype for an LSN. Internally it is a 64-bit integer, but the string - representation is like "1/123abcd". See also pg_lsn datatype in Postgres + representation is like "1/0123abcd". See also pg_lsn datatype in Postgres """ def __init__(self, x: Union[int, str]): diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ee62372871..50284a3f5a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -933,8 +933,11 @@ class NeonEnvBuilder: for directory_to_clean in reversed(directories_to_clean): if not os.listdir(directory_to_clean): - log.debug(f"Removing empty directory {directory_to_clean}") - directory_to_clean.rmdir() + log.info(f"Removing empty directory {directory_to_clean}") + try: + directory_to_clean.rmdir() + except Exception as e: + log.error(f"Error removing empty directory {directory_to_clean}: {e}") def cleanup_remote_storage(self): for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: @@ -3423,6 +3426,7 @@ class VanillaPostgres(PgProtocol): assert not self.running with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: conf_file.write("\n".join(options)) + conf_file.write("\n") def edit_hba(self, hba: List[str]): """Prepend hba lines into pg_hba.conf file.""" @@ -3476,6 +3480,7 @@ def vanilla_pg( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"]) yield vanilla_pg diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index e12c8e5f4a..258935959b 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -16,6 +16,7 @@ class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" + V17 = "17" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json new file mode 100644 index 0000000000..7990b2c3a2 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v17/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG17 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index b559be5f18..fb5c1d3115 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -21,7 +21,7 @@ from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.workload import Workload @@ -156,6 +156,9 @@ ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_ @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, @@ -203,6 +206,9 @@ def test_backward_compatibility( @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") +@skip_on_postgres( + PgVersion.V17, "There are no snapshots yet" +) # TODO: revert this once we have snapshots def test_forward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 27eb05ac09..7370eb1456 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -44,6 +44,8 @@ def test_remote_extensions( ): if pg_version == PgVersion.V16: pytest.skip("TODO: PG16 extension building") + if pg_version == PgVersion.V17: + pytest.skip("TODO: PG17 extension building") # setup mock http server # that expects request for anon.tar.zst diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py index 03e8c7c0df..4145a303c6 100644 --- a/test_runner/regress/test_postgres_version.py +++ b/test_runner/regress/test_postgres_version.py @@ -20,16 +20,19 @@ def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): output = f.read().strip() # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". - pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + # beta- and release candidate releases would use '17beta1' and '18rc2' instead of .-separated numbers. + pattern = ( + r"postgres \(PostgreSQL\) (?P\d+(?:beta|rc|\.)\d+) \((?P[0-9a-f]{40})\)" + ) match = re.search(pattern, output, re.IGNORECASE) assert match is not None, f"Can't parse {output} with {pattern}" version = match.group("version") commit = match.group("commit") - assert ( - pg_version.v_prefixed in expected_revisions - ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" - - msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" - assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg + if "." in version: + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Released PostgreSQL version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d152d0f41f..f98b53d966 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -118,6 +118,9 @@ def test_ancestor_detach_branched_from( truncated_layers = 0 elif branchpoint == Branchpoint.AFTER_L0: branch_at = Lsn(last_lsn + 8) + # make sure the branch point is not on a page header + if 0 < (branch_at.lsn_int % 8192) < 40: + branch_at += 40 rows = 8192 # as there is no 8 byte walrecord, nothing should get copied from the straddling layer truncated_layers = 0 diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index ea900b07b8..ebe65e7c29 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -1,19 +1,32 @@ import os +from pathlib import Path +from fixtures.common_types import TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import ( + NeonEnv, + PgBin, + fork_at_current_lsn, + import_timeline_from_vanilla_postgres, +) # # Test branching, when a transaction is in prepared state # -def test_twophase(neon_simple_env: NeonEnv): - env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=5"]) +def twophase_test_on_timeline(env: NeonEnv): + endpoint = env.endpoints.create_start( + "test_twophase", config_lines=["max_prepared_transactions=5"] + ) conn = endpoint.connect() cur = conn.cursor() + # FIXME: Switch to the next WAL segment, to work around the bug fixed in + # https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be + # removed. + cur.execute("select pg_switch_wal()") + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row @@ -53,7 +66,7 @@ def test_twophase(neon_simple_env: NeonEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "main") + fork_at_current_lsn(env, endpoint, "test_twophase_prepared", "test_twophase") # Start compute on the new branch endpoint2 = env.endpoints.create_start( @@ -80,3 +93,50 @@ def test_twophase(neon_simple_env: NeonEnv): # Only one committed insert is visible on the original branch cur.execute("SELECT * FROM foo") assert cur.fetchall() == [("three",)] + + +def test_twophase(neon_simple_env: NeonEnv): + """ + Test branching, when a transaction is in prepared state + """ + env = neon_simple_env + env.neon_cli.create_branch("test_twophase") + + twophase_test_on_timeline(env) + + +def test_twophase_nonzero_epoch( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Same as 'test_twophase' test, but with a non-zero XID epoch, i.e. after 4 billion XIDs + have been consumed. (This is to ensure that we correctly use the full 64-bit XIDs in + pg_twophase filenames with PostgreSQL v17.) + """ + env = neon_simple_env + + # Reset the vanilla Postgres instance with a higher XID epoch + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "--epoch=1000000000", "-D", str(vanilla_pg.pgdatadir)] + pg_bin.run_capture(cmd) + + timeline_id = TimelineId.generate() + + # Import the cluster to Neon + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + env.initial_tenant, + timeline_id, + "test_twophase", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() # don't need the original server anymore + + twophase_test_on_timeline(env) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 new file mode 160000 index 0000000000..9156d63ce2 --- /dev/null +++ b/vendor/postgres-v17 @@ -0,0 +1 @@ +Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa