From 83e7e5dbbdab4026bf7ec6c33ec27a5c07ef98b8 Mon Sep 17 00:00:00 2001 From: MMeent Date: Tue, 12 Sep 2023 15:11:32 +0200 Subject: [PATCH] Feat/postgres 16 (#4761) This adds PostgreSQL 16 as a vendored postgresql version, and adapts the code to support this version. The important changes to PostgreSQL 16 compared to the PostgreSQL 15 changeset include the addition of a neon_rmgr instead of altering Postgres's original WAL format. Co-authored-by: Alexander Bayandin Co-authored-by: Heikki Linnakangas --- .dockerignore | 1 + .../actions/run-python-test-set/action.yml | 3 + .github/workflows/build_and_test.yml | 33 +- .github/workflows/neon_extra_builds.yml | 17 +- .gitmodules | 4 + Dockerfile | 4 + Dockerfile.compute-node | 99 +- Makefile | 40 +- README.md | 2 +- compute_tools/src/extension_server.rs | 17 +- control_plane/src/local_env.rs | 16 +- libs/postgres_ffi/README.md | 8 +- libs/postgres_ffi/build.rs | 3 +- libs/postgres_ffi/src/lib.rs | 99 +- libs/postgres_ffi/src/pg_constants.rs | 14 + libs/postgres_ffi/src/pg_constants_v14.rs | 5 + libs/postgres_ffi/src/pg_constants_v15.rs | 10 +- libs/postgres_ffi/src/pg_constants_v16.rs | 18 + libs/postgres_ffi/wal_craft/src/lib.rs | 26 +- libs/utils/scripts/restore_from_wal.sh | 7 +- pageserver/src/basebackup.rs | 21 +- pageserver/src/config.rs | 16 +- pageserver/src/walingest.rs | 407 +++++++- pageserver/src/walrecord.rs | 356 +++++-- pgxn/neon/file_cache.c | 40 +- pgxn/neon/libpagestore.c | 2 +- pgxn/neon/neon.c | 14 +- pgxn/neon/neon_pgversioncompat.h | 112 +++ pgxn/neon/pagestore_client.h | 57 +- pgxn/neon/pagestore_smgr.c | 399 ++++---- pgxn/neon/relsize_cache.c | 22 +- pgxn/neon/walproposer.c | 123 ++- pgxn/neon/walproposer.h | 4 +- pgxn/neon/walproposer_utils.c | 9 +- pgxn/neon_rmgr/Makefile | 19 + pgxn/neon_rmgr/neon_rmgr.c | 886 ++++++++++++++++++ pgxn/neon_rmgr/neon_rmgr.control | 4 + pgxn/neon_rmgr/neon_rmgr.h | 13 + pgxn/neon_rmgr/neon_rmgr_decode.c | 404 ++++++++ pgxn/neon_rmgr/neon_rmgr_desc.c | 181 ++++ 
pgxn/neon_test_utils/neontest.c | 30 +- pgxn/neon_walredo/inmem_smgr.c | 73 +- pgxn/neon_walredo/inmem_smgr.h | 2 +- pgxn/neon_walredo/walredoproc.c | 92 +- safekeeper/src/wal_storage.rs | 22 +- test_runner/fixtures/neon_fixtures.py | 1 + test_runner/fixtures/pg_version.py | 1 + .../5670669815/v16/ext_index.json | 7 + test_runner/regress/test_compatibility.py | 4 +- .../regress/test_download_extensions.py | 8 +- vendor/postgres-v16 | 1 + vendor/revisions.json | 1 + 52 files changed, 3201 insertions(+), 556 deletions(-) create mode 100644 libs/postgres_ffi/src/pg_constants_v16.rs create mode 100644 pgxn/neon/neon_pgversioncompat.h create mode 100644 pgxn/neon_rmgr/Makefile create mode 100644 pgxn/neon_rmgr/neon_rmgr.c create mode 100644 pgxn/neon_rmgr/neon_rmgr.control create mode 100644 pgxn/neon_rmgr/neon_rmgr.h create mode 100644 pgxn/neon_rmgr/neon_rmgr_decode.c create mode 100644 pgxn/neon_rmgr/neon_rmgr_desc.c create mode 100644 test_runner/regress/data/extension_test/5670669815/v16/ext_index.json create mode 160000 vendor/postgres-v16 diff --git a/.dockerignore b/.dockerignore index 396fba3568..ae0ad8fd77 100644 --- a/.dockerignore +++ b/.dockerignore @@ -19,6 +19,7 @@ !trace/ !vendor/postgres-v14/ !vendor/postgres-v15/ +!vendor/postgres-v16/ !workspace_hack/ !neon_local/ !scripts/ninstall.sh diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 013b446307..8dfa6c465f 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -70,6 +70,9 @@ runs: name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest + # The lack of compatibility snapshot (for example, for the new Postgres version) + # shouldn't fail the whole job. Only relevant test should fail. 
+ skip-if-does-not-exist: true - name: Checkout if: inputs.needs_postgres_source == 'true' diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 898d19e7d4..a2376f6d15 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -212,7 +212,7 @@ jobs: # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 FAILED=false - for postgres in postgres-v14 postgres-v15; do + for postgres in postgres-v14 postgres-v15 postgres-v16; do expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') actual=$(git rev-parse "HEAD:vendor/${postgres}") if [ "${expected}" != "${actual}" ]; then @@ -234,6 +234,10 @@ jobs: id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + # Set some environment variables used by all the steps. # # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. 
@@ -294,6 +298,13 @@ jobs: path: pg_install/v15 key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v3 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) @@ -302,6 +313,10 @@ jobs: if: steps.cache_pg_15.outputs.cache-hit != 'true' run: mold -run make postgres-v15 -j$(nproc) + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: mold -run make postgres-v16 -j$(nproc) + - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) @@ -385,7 +400,7 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - pg_version: [ v14, v15 ] + pg_version: [ v14, v15, v16 ] steps: - name: Checkout uses: actions/checkout@v3 @@ -760,7 +775,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15 ] + version: [ v14, v15, v16 ] defaults: run: shell: sh -eu {0} @@ -814,7 +829,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15 ] + version: [ v14, v15, v16 ] defaults: run: shell: sh -eu {0} @@ -915,6 +930,7 @@ jobs: run: | crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 + crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 - name: Add latest tag to images if: | @@ -927,6 +943,8 @@ jobs: crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - name: Push images to production ECR if: | @@ -939,6 +957,8 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest - name: Configure Docker Hub login run: | @@ -950,6 +970,7 @@ jobs: run: | crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} + crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - name: Push latest tags to Docker Hub if: | @@ -962,6 +983,8 @@ jobs: crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag 
neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - name: Cleanup ECR folder run: rm -rf ~/.ecr @@ -1119,7 +1142,7 @@ jobs: PREFIX: artifacts/latest run: | # Update compatibility snapshot for the release - for pg_version in v14 v15; do + for pg_version in v14 v15 v16; do for build_type in debug release; do OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index f4e9e27e2b..8a1e4571fd 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -38,7 +38,7 @@ jobs: fetch-depth: 1 - name: Install macOS postgres dependencies - run: brew install flex bison openssl protobuf + run: brew install flex bison openssl protobuf icu4c pkg-config - name: Set pg 14 revision for caching id: pg_v14_rev @@ -48,6 +48,10 @@ jobs: id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + - name: Set pg 16 revision for caching + id: pg_v16_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT + - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v3 @@ -62,6 +66,13 @@ jobs: path: pg_install/v15 key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Cache postgres v16 build + id: cache_pg_16 + uses: actions/cache@v3 + with: + path: pg_install/v16 + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + - name: Set extra env for macOS run: | echo 
'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV @@ -85,6 +96,10 @@ jobs: if: steps.cache_pg_15.outputs.cache-hit != 'true' run: make postgres-v15 -j$(nproc) + - name: Build postgres v16 + if: steps.cache_pg_16.outputs.cache-hit != 'true' + run: make postgres-v16 -j$(nproc) + - name: Build neon extensions run: make neon-pg-ext -j$(nproc) diff --git a/.gitmodules b/.gitmodules index 081a404135..1d925674a1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,7 @@ path = vendor/postgres-v15 url = https://github.com/neondatabase/postgres.git branch = REL_15_STABLE_neon +[submodule "vendor/postgres-v16"] + path = vendor/postgres-v16 + url = https://github.com/neondatabase/postgres.git + branch = REL_16_STABLE_neon diff --git a/Dockerfile b/Dockerfile index 1c447b2db9..eb4c4bba25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,7 @@ WORKDIR /home/nonroot COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 +COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh @@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. 
@@ -65,6 +67,7 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ + libicu67 \ openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ @@ -81,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ +COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 79bce8c244..dd3d751799 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -371,12 +371,21 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz FROM build-deps AS timescaledb-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN apt-get update && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export TIMESCALEDB_VERSION=2.10.1 \ + export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ + ;; \ + *) \ + echo "TimescaleDB not supported on this PostgreSQL version. 
See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \ + esac && \ + apt-get update && \ apt-get install -y cmake && \ - wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ - echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ + echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ @@ -405,6 +414,10 @@ RUN case "${PG_VERSION}" in \ export PG_HINT_PLAN_VERSION=15_1_5_0 \ export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \ ;; \ + "v16") \ + export PG_HINT_PLAN_VERSION=16_1_6_0 \ + export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \ + ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ ;; \ @@ -551,8 +564,16 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \ - echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PG_EMBEDDING_VERSION=0.3.5 \ + export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \ + ;; \ + *) \ + echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." 
&& exit 0;; \ + esac && \ + wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ + echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -584,6 +605,10 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre # Layer "rust extensions" # This layer is used to build `pgx` deps # +# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from +# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx +# dependency on all the rust extension that depend on it, too. +# ######################################################################################### FROM build-deps AS rust-extensions-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -598,7 +623,17 @@ USER nonroot WORKDIR /home/nonroot ARG PG_VERSION -RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. 
Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \ + ;; \ + esac && \ + curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ @@ -615,10 +650,21 @@ USER root ######################################################################################### FROM rust-extensions-build AS pg-jsonschema-pg-build +ARG PG_VERSION # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023 # there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5 -RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \ echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -633,12 +679,23 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e ######################################################################################### FROM rust-extensions-build AS pg-graphql-pg-build +ARG PG_VERSION # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch) # Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in # pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the # same 1.1 version we've used before. -RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \ echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ @@ -656,9 +713,20 @@ RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367 ######################################################################################### FROM rust-extensions-build AS pg-tiktoken-pg-build +ARG PG_VERSION # 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023 -RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \ echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ cargo pgx install --release && \ @@ -672,8 +740,19 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405 ######################################################################################### FROM rust-extensions-build AS pg-pgx-ulid-build +ARG PG_VERSION -RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + ;; \ + "v16") \ + echo "TODO: Not yet supported for PostgreSQL 16. 
Need to update pgrx dependencies" && exit 0 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ diff --git a/Makefile b/Makefile index 0768b64502..0a1d1f0d43 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,7 @@ else ifeq ($(UNAME_S),Darwin) # It can be configured with OPENSSL_PREFIX variable OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: @@ -83,6 +84,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: # I'm not sure why it wouldn't work, but this is the only place (apart from # the "build-all-versions" entry points) where direct mention of PostgreSQL # versions is used. 
+.PHONY: postgres-configure-v16 +postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status .PHONY: postgres-configure-v15 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status .PHONY: postgres-configure-v14 @@ -118,6 +121,10 @@ postgres-clean-%: $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean +.PHONY: postgres-check-% +postgres-check-%: postgres-% + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check + .PHONY: neon-pg-ext-% neon-pg-ext-%: postgres-% +@echo "Compiling neon $*" @@ -130,6 +137,11 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install + +@echo "Compiling neon_rmgr $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install +@echo "Compiling neon_test_utils $*" mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ @@ -140,6 +152,13 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install + +# pg_embedding was temporarily released as hnsw from this repo, when we only +# supported PostgreSQL 14 and 15 +neon-pg-ext-v14: neon-pg-ext-hnsw-v14 +neon-pg-ext-v15: neon-pg-ext-hnsw-v15 + +neon-pg-ext-hnsw-%: postgres-headers-% postgres-% +@echo "Compiling hnsw $*" mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ @@ 
-167,28 +186,39 @@ neon-pg-ext-clean-%: .PHONY: neon-pg-ext neon-pg-ext: \ neon-pg-ext-v14 \ - neon-pg-ext-v15 + neon-pg-ext-v15 \ + neon-pg-ext-v16 .PHONY: neon-pg-ext-clean neon-pg-ext-clean: \ neon-pg-ext-clean-v14 \ - neon-pg-ext-clean-v15 + neon-pg-ext-clean-v15 \ + neon-pg-ext-clean-v16 # shorthand to build all Postgres versions .PHONY: postgres postgres: \ postgres-v14 \ - postgres-v15 + postgres-v15 \ + postgres-v16 .PHONY: postgres-headers postgres-headers: \ postgres-headers-v14 \ - postgres-headers-v15 + postgres-headers-v15 \ + postgres-headers-v16 .PHONY: postgres-clean postgres-clean: \ postgres-clean-v14 \ - postgres-clean-v15 + postgres-clean-v15 \ + postgres-clean-v16 + +.PHONY: postgres-check +postgres-check: \ + postgres-check-v14 \ + postgres-check-v15 \ + postgres-check-v16 # This doesn't remove the effects of 'configure'. .PHONY: clean diff --git a/README.md b/README.md index d948a92062..0d8c3f5c99 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 1. Install XCode and dependencies ``` xcode-select --install -brew install protobuf openssl flex bison +brew install protobuf openssl flex bison icu4c pkg-config # add openssl to PATH, required for ed25519 keys generation in neon_local echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index bf32f4a517..3d7ed8c360 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -107,19 +107,25 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String { pub fn get_pg_version(pgbin: &str) -> String { // pg_config --version returns a (platform specific) human readable string - // such as "PostgreSQL 15.4". We parse this to v14/v15 + // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. 
let human_version = get_pg_config("--version", pgbin); return parse_pg_version(&human_version).to_string(); } fn parse_pg_version(human_version: &str) -> &str { - match Regex::new(r"(?<major>\d+)\.(?<minor>\d+)") + // Normal releases have version strings like "PostgreSQL 15.4". But there + // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL + // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version + // configure option, you can tack any string to the version number, + // e.g. "PostgreSQL 15.4foobar". + match Regex::new(r"^PostgreSQL (?<major>\d+).+") .unwrap() .captures(human_version) { - Some(captures) if captures.len() == 3 => match &captures["major"] { + Some(captures) if captures.len() == 2 => match &captures["major"] { "14" => return "v14", "15" => return "v15", + "16" => return "v16", _ => {} }, _ => {} @@ -146,6 +152,11 @@ mod tests { parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"), "v14" ); + + assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16"); } #[test] diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b4d09b01ab..45a7469787 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -182,26 +182,18 @@ impl LocalEnv { pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> { let path = self.pg_distrib_dir.clone(); + #[allow(clippy::manual_range_patterns)] match pg_version { - 14 => Ok(path.join(format!("v{pg_version}"))), - 15 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - 15 => 
Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } pub fn pageserver_bin(&self) -> PathBuf { diff --git a/libs/postgres_ffi/README.md b/libs/postgres_ffi/README.md index de046eb3da..ae949d2da6 100644 --- a/libs/postgres_ffi/README.md +++ b/libs/postgres_ffi/README.md @@ -10,9 +10,11 @@ should be auto-generated too, but that's a TODO. The PostgreSQL on-disk file format is not portable across different CPU architectures and operating systems. It is also subject to change in each major PostgreSQL version. Currently, this module supports -PostgreSQL v14 and v15: bindings and code that depends on them are version-specific. -This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15` -Version independend code is explicitly exported into shared `postgres_ffi`. +PostgreSQL v14, v15 and v16: bindings and code that depends on them are +version-specific. +This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and +`postgres_ffi::v16`. Version independent code is explicitly exported into +shared `postgres_ffi`. 
TODO: Currently, there is also some code that deals with WAL records diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index f7e39751ef..8e6761d6d3 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> { PathBuf::from("pg_install") }; - for pg_version in &["v14", "v15"] { + for pg_version in &["v14", "v15", "v16"] { let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); if pg_install_dir_versioned.is_relative() { let cwd = env::current_dir().context("Failed to get current_dir")?; @@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> { .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") .allowlist_type("PageHeaderData") .allowlist_type("DBState") + .allowlist_type("RelMapFile") // Because structs are used for serialization, tell bindgen to emit // explicit padding fields. .explicit_padding(true) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index cc115664d5..c9e5df9f04 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -51,11 +51,59 @@ macro_rules! for_all_postgres_versions { ($macro:tt) => { $macro!(v14); $macro!(v15); + $macro!(v16); }; } for_all_postgres_versions! { postgres_ffi } +/// dispatch_pgversion +/// +/// Run a code block in a context where the postgres_ffi bindings for a +/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv +/// identifier. +/// If the provided pg_version is not supported, we panic!(), unless the +/// optional third argument was provided (in which case that code will provide +/// the default handling instead). +/// +/// Use like +/// +/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE }) +/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE) +/// +/// Other uses are for macro-internal purposes only and strictly unsupported. +/// +#[macro_export] +macro_rules! 
dispatch_pgversion { + ($version:expr, $code:expr) => { + dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version)) + }; + ($version:expr, $code:expr, $invalid_pgver_handling:expr) => { + dispatch_pgversion!( + $version => $code, + default = $invalid_pgver_handling, + pgversions = [ + 14 : v14, + 15 : v15, + 16 : v16, + ] + ) + }; + ($pgversion:expr => $code:expr, + default = $default:expr, + pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => { + match ($pgversion) { + $($sv => { + use $crate::$vsv as pgv; + $code + },)+ + _ => { + $default + } + } + }; +} + pub mod pg_constants; pub mod relfile_utils; @@ -90,13 +138,7 @@ pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { - match version { - 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), - 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 - || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), - _ => anyhow::bail!("Unknown version {}", version), - } + dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info))) } pub fn generate_wal_segment( @@ -107,11 +149,11 @@ pub fn generate_wal_segment( ) -> Result { assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); - match pg_version { - 14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn), - 15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn), - _ => Err(SerializeError::BadInput), - } + dispatch_pgversion!( + pg_version, + pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn), + Err(SerializeError::BadInput) + ) } pub fn generate_pg_control( @@ -120,11 +162,11 @@ pub fn generate_pg_control( lsn: Lsn, pg_version: u32, ) -> anyhow::Result<(Bytes, u64)> { - match pg_version { - 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), - 15 => 
v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), - _ => anyhow::bail!("Unknown version {}", pg_version), - } + dispatch_pgversion!( + pg_version, + pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + anyhow::bail!("Unknown version {}", pg_version) + ) } // PG timeline is always 1, changing it doesn't have any useful meaning in Neon. @@ -196,8 +238,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { } pub mod waldecoder { - - use crate::{v14, v15}; use bytes::{Buf, Bytes, BytesMut}; use std::num::NonZeroU32; use thiserror::Error; @@ -248,22 +288,17 @@ pub mod waldecoder { } pub fn poll_decode(&mut self) -> Result, WalDecodeError> { - match self.pg_version { - // This is a trick to support both versions simultaneously. - // See WalStreamDecoderHandler comments. - 14 => { - use self::v14::waldecoder_handler::WalStreamDecoderHandler; + dispatch_pgversion!( + self.pg_version, + { + use pgv::waldecoder_handler::WalStreamDecoderHandler; self.poll_decode_internal() - } - 15 => { - use self::v15::waldecoder_handler::WalStreamDecoderHandler; - self.poll_decode_internal() - } - _ => Err(WalDecodeError { + }, + Err(WalDecodeError { msg: format!("Unknown version {}", self.pg_version), lsn: self.lsn, - }), - } + }) + ) } } } diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 9c39b46cc1..1d196c3fe7 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -163,6 +163,20 @@ pub const RM_HEAP2_ID: u8 = 9; pub const RM_HEAP_ID: u8 = 10; pub const RM_LOGICALMSG_ID: u8 = 21; +// from neon_rmgr.h +pub const RM_NEON_ID: u8 = 134; + +pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80; + +pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00; +pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10; +pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20; +pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30; +pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40; +pub const 
XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50; + +pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40; + // from xlogreader.h pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index 810898ee80..32f8f51114 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -3,3 +3,8 @@ pub const XLOG_DBASE_DROP: u8 = 0x10; pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ +pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0 +} diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 6fa5eb008c..626a23c7ea 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -1,10 +1,18 @@ pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; -pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; pub const XLOG_DBASE_DROP: u8 = 0x20; pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */ + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs 
b/libs/postgres_ffi/src/pg_constants_v16.rs new file mode 100644 index 0000000000..587be71cb3 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -0,0 +1,18 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ + +pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */ + +pub fn bkpimg_is_compressed(bimg_info: u8) -> bool { + const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD; + + (bimg_info & ANY_COMPRESS_FLAG) != 0 +} diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index d4aed88048..fb627ca258 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -49,9 +49,9 @@ impl Conf { pub fn pg_distrib_dir(&self) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); + #[allow(clippy::manual_range_patterns)] match self.pg_version { - 14 => Ok(path.join(format!("v{}", self.pg_version))), - 15 => Ok(path.join(format!("v{}", self.pg_version))), + 14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))), _ => bail!("Unsupported postgres version: {}", self.pg_version), } } @@ -250,11 +250,18 @@ fn craft_internal( let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; let last_lsn = match last_lsn { None => client.pg_current_wal_insert_lsn()?, - Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) 
{ - Ordering::Less => bail!("Some records were inserted after the crafted WAL"), - Ordering::Equal => last_lsn, - Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), - }, + Some(last_lsn) => { + let insert_lsn = client.pg_current_wal_insert_lsn()?; + match last_lsn.cmp(&insert_lsn) { + Ordering::Less => bail!( + "Some records were inserted after the crafted WAL: {} vs {}", + last_lsn, + insert_lsn + ), + Ordering::Equal => last_lsn, + Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), + } + } }; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); @@ -363,8 +370,9 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { ); ensure!( u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}", - after_xlog_switch + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", + after_xlog_switch, + u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ ); Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) } diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 92cd164b7d..316ec8ed0d 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -9,11 +9,12 @@ PORT=$4 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) rm -fr "$DATA_DIR" env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" -echo port="$PORT" >> "$DATA_DIR"/postgresql.conf +echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf +echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) declare -i WAL_SIZE=$REDO_POS+114 -"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start -"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate 
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start +"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate cp "$DATA_DIR"/pg_wal/000000010000000000000001 . cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/ for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d2dc759835..a959f1cddc 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -25,6 +25,7 @@ use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; @@ -323,14 +324,25 @@ where .timeline .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; - ensure!(img.len() == 512); + + ensure!( + img.len() + == dispatch_pgversion!( + self.timeline.pg_version, + pgv::bindings::SIZEOF_RELMAPFILE + ) + ); + Some(img) } else { None }; if spcnode == GLOBALTABLESPACE_OID { - let pg_version_str = self.timeline.pg_version.to_string(); + let pg_version_str = match self.timeline.pg_version { + 14 | 15 => self.timeline.pg_version.to_string(), + ver => format!("{ver}\x0A"), + }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; self.ar.append(&header, pg_version_str.as_bytes()).await?; @@ -374,7 +386,10 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); - let pg_version_str = self.timeline.pg_version.to_string(); + let pg_version_str = match self.timeline.pg_version { + 14 | 15 => self.timeline.pg_version.to_string(), + ver => format!("{ver}\x0A"), + }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; self.ar.append(&header, pg_version_str.as_bytes()).await?; diff 
--git a/pageserver/src/config.rs b/pageserver/src/config.rs index 89ba3b6310..dbebde26bd 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -668,26 +668,18 @@ impl PageServerConf { pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); + #[allow(clippy::manual_range_patterns)] match pg_version { - 14 => Ok(path.join(format!("v{pg_version}"))), - 15 => Ok(path.join(format!("v{pg_version}"))), + 14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))), _ => bail!("Unsupported postgres version: {}", pg_version), } } pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - match pg_version { - 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 340b75877d..9192af0ee8 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; @@ -106,6 +106,10 @@ impl<'a> WalIngest<'a> { self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } + if decoded.xl_rmid == pg_constants::RM_NEON_ID { + 
self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + .await?; + } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) @@ -172,6 +176,32 @@ impl<'a> WalIngest<'a> { .await?; } } + } else if self.timeline.pg_version == 16 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb, ctx) + .await?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v16::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; + } + } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); @@ -414,57 +444,346 @@ impl<'a> WalIngest<'a> { // need to clear the corresponding bits in the visibility map. 
let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = XlHeapDelete::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[1].blkno); + + match self.timeline.pg_version { + 14 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v14::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v14::XlHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v14::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[1].blkno); + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v14::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = XlHeapMultiInsert::decode(buf); + 15 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v15::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v15::XlHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v15::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. 
+ // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[1].blkno); + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v15::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } } else { - std::mem::size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 16 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v16::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v16::XlHeapDelete::decode(buf); + 
assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v16::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[1].blkno); + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v16::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + } else { + bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + _ => {} + } + // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED? + + // Clear the VM bits if required. 
+ if new_heap_blkno.is_some() || old_heap_blkno.is_some() { + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, + spcnode: decoded.blocks[0].rnode_spcnode, + dbnode: decoded.blocks[0].rnode_dbnode, + relnode: decoded.blocks[0].rnode_relnode, + }; + + let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + + // Sometimes, Postgres seems to create heap WAL records with the + // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is + // not set. In fact, it's possible that the VM page does not exist at all. + // In that case, we don't want to store a record to clear the VM bit; + // replaying it would fail to find the previous image of the page, because + // it doesn't exist. So check if the VM page(s) exist, and skip the WAL + // record if it doesn't. + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + if let Some(blknum) = new_vm_blk { + if blknum >= vm_size { + new_vm_blk = None; + } + } + if let Some(blknum) = old_vm_blk { + if blknum >= vm_size { + old_vm_blk = None; + } + } + + if new_vm_blk.is_some() || old_vm_blk.is_some() { + if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the + // new page, both of which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on + // different VM pages. 
+ if let Some(new_vm_blk) = new_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno: None, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, + old_heap_blkno, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + ctx, + ) + .await?; + } } } } - // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED? + + Ok(()) + } + + async fn ingest_neonrmgr_record( + &mut self, + buf: &mut Bytes, + modification: &mut DatadirModification<'_>, + decoded: &mut DecodedWALRecord, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Handle VM bit updates that are implicitly part of heap records. + + // First, look at the record to determine which VM bits need + // to be cleared. If either of these variables is set, we + // need to clear the corresponding bits in the visibility map. 
+ let mut new_heap_blkno: Option = None; + let mut old_heap_blkno: Option = None; + assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); + + match self.timeline.pg_version { + 16 => { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + match info { + pg_constants::XLOG_NEON_HEAP_INSERT => { + let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_DELETE => { + let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_UPDATE + | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { + let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[1].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { + let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + std::mem::size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_LOCK => { + /* XLOG_NEON_HEAP_LOCK doesn't need special care */ + } + info => bail!("Unknown WAL record type for Neon RMGR: {}", info), + } + } + _ => bail!( + "Neon RMGR has no known compatibility with PostgreSQL version {}", + self.timeline.pg_version + ), + } + + // FIXME: What about XLOG_NEON_HEAP_LOCK? // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 1a34168fed..27d73fb46d 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -4,9 +4,10 @@ use anyhow::Result; use bytes::{Buf, Bytes}; +use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz}; +use postgres_ffi::{BlockNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; @@ -76,9 +77,12 @@ pub struct DecodedBkpBlock { pub flags: u8, /* Information on full-page image, if any */ - pub has_image: bool, /* has image, even for consistency checking */ - pub apply_image: bool, /* has image that should be restored */ - pub will_init: bool, /* record doesn't need previous page version to apply */ + pub has_image: bool, + /* has image, even 
for consistency checking */ + pub apply_image: bool, + /* has image that should be restored */ + pub will_init: bool, + /* record doesn't need previous page version to apply */ //char *bkp_image; pub hole_offset: u16, pub hole_length: u16, @@ -134,6 +138,237 @@ impl XlRelmapUpdate { } } +pub mod v14 { + use bytes::{Buf, Bytes}; + use postgres_ffi::{OffsetNumber, TransactionId}; + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapInsert { + pub offnum: OffsetNumber, + pub flags: u8, + } + + impl XlHeapInsert { + pub fn decode(buf: &mut Bytes) -> XlHeapInsert { + XlHeapInsert { + offnum: buf.get_u16_le(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapMultiInsert { + pub flags: u8, + pub _padding: u8, + pub ntuples: u16, + } + + impl XlHeapMultiInsert { + pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert { + XlHeapMultiInsert { + flags: buf.get_u8(), + _padding: buf.get_u8(), + ntuples: buf.get_u16_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapDelete { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub _padding: u16, + pub t_cid: u32, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapDelete { + pub fn decode(buf: &mut Bytes) -> XlHeapDelete { + XlHeapDelete { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + _padding: buf.get_u16_le(), + t_cid: buf.get_u32_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapUpdate { + pub old_xmax: TransactionId, + pub old_offnum: OffsetNumber, + pub old_infobits_set: u8, + pub flags: u8, + pub t_cid: u32, + pub new_xmax: TransactionId, + pub new_offnum: OffsetNumber, + } + + impl XlHeapUpdate { + pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { + XlHeapUpdate { + old_xmax: buf.get_u32_le(), + old_offnum: buf.get_u16_le(), + old_infobits_set: buf.get_u8(), + flags: buf.get_u8(), + t_cid: buf.get_u32(), + new_xmax: buf.get_u32_le(), + new_offnum: buf.get_u16_le(), + } + 
} + } +} + +pub mod v15 { + pub use super::v14::{XlHeapDelete, XlHeapInsert, XlHeapMultiInsert, XlHeapUpdate}; +} + +pub mod v16 { + pub use super::v14::{XlHeapInsert, XlHeapMultiInsert}; + use bytes::{Buf, Bytes}; + use postgres_ffi::{OffsetNumber, TransactionId}; + + pub struct XlHeapDelete { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + } + + impl XlHeapDelete { + pub fn decode(buf: &mut Bytes) -> XlHeapDelete { + XlHeapDelete { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlHeapUpdate { + pub old_xmax: TransactionId, + pub old_offnum: OffsetNumber, + pub old_infobits_set: u8, + pub flags: u8, + pub new_xmax: TransactionId, + pub new_offnum: OffsetNumber, + } + + impl XlHeapUpdate { + pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { + XlHeapUpdate { + old_xmax: buf.get_u32_le(), + old_offnum: buf.get_u16_le(), + old_infobits_set: buf.get_u8(), + flags: buf.get_u8(), + new_xmax: buf.get_u32_le(), + new_offnum: buf.get_u16_le(), + } + } + } + + /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. 
*/ + pub mod rm_neon { + use bytes::{Buf, Bytes}; + use postgres_ffi::{OffsetNumber, TransactionId}; + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapInsert { + pub offnum: OffsetNumber, + pub flags: u8, + } + + impl XlNeonHeapInsert { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapInsert { + XlNeonHeapInsert { + offnum: buf.get_u16_le(), + flags: buf.get_u8(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapMultiInsert { + pub flags: u8, + pub _padding: u8, + pub ntuples: u16, + pub t_cid: u32, + } + + impl XlNeonHeapMultiInsert { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapMultiInsert { + XlNeonHeapMultiInsert { + flags: buf.get_u8(), + _padding: buf.get_u8(), + ntuples: buf.get_u16_le(), + t_cid: buf.get_u32_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapDelete { + pub xmax: TransactionId, + pub offnum: OffsetNumber, + pub infobits_set: u8, + pub flags: u8, + pub t_cid: u32, + } + + impl XlNeonHeapDelete { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapDelete { + XlNeonHeapDelete { + xmax: buf.get_u32_le(), + offnum: buf.get_u16_le(), + infobits_set: buf.get_u8(), + flags: buf.get_u8(), + t_cid: buf.get_u32_le(), + } + } + } + + #[repr(C)] + #[derive(Debug)] + pub struct XlNeonHeapUpdate { + pub old_xmax: TransactionId, + pub old_offnum: OffsetNumber, + pub old_infobits_set: u8, + pub flags: u8, + pub t_cid: u32, + pub new_xmax: TransactionId, + pub new_offnum: OffsetNumber, + } + + impl XlNeonHeapUpdate { + pub fn decode(buf: &mut Bytes) -> XlNeonHeapUpdate { + XlNeonHeapUpdate { + old_xmax: buf.get_u32_le(), + old_offnum: buf.get_u16_le(), + old_infobits_set: buf.get_u8(), + flags: buf.get_u8(), + t_cid: buf.get_u32(), + new_xmax: buf.get_u32_le(), + new_offnum: buf.get_u16_le(), + } + } + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrCreate { @@ -223,90 +458,6 @@ impl XlDropDatabase { } } -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapInsert { - pub offnum: OffsetNumber, - pub flags: u8, -} - 
-impl XlHeapInsert { - pub fn decode(buf: &mut Bytes) -> XlHeapInsert { - XlHeapInsert { - offnum: buf.get_u16_le(), - flags: buf.get_u8(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapMultiInsert { - pub flags: u8, - pub _padding: u8, - pub ntuples: u16, -} - -impl XlHeapMultiInsert { - pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert { - XlHeapMultiInsert { - flags: buf.get_u8(), - _padding: buf.get_u8(), - ntuples: buf.get_u16_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapDelete { - pub xmax: TransactionId, - pub offnum: OffsetNumber, - pub _padding: u16, - pub t_cid: u32, - pub infobits_set: u8, - pub flags: u8, -} - -impl XlHeapDelete { - pub fn decode(buf: &mut Bytes) -> XlHeapDelete { - XlHeapDelete { - xmax: buf.get_u32_le(), - offnum: buf.get_u16_le(), - _padding: buf.get_u16_le(), - t_cid: buf.get_u32_le(), - infobits_set: buf.get_u8(), - flags: buf.get_u8(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlHeapUpdate { - pub old_xmax: TransactionId, - pub old_offnum: OffsetNumber, - pub old_infobits_set: u8, - pub flags: u8, - pub t_cid: u32, - pub new_xmax: TransactionId, - pub new_offnum: OffsetNumber, -} - -impl XlHeapUpdate { - pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { - XlHeapUpdate { - old_xmax: buf.get_u32_le(), - old_offnum: buf.get_u16_le(), - old_infobits_set: buf.get_u8(), - flags: buf.get_u8(), - t_cid: buf.get_u32(), - new_xmax: buf.get_u32_le(), - new_offnum: buf.get_u16_le(), - } - } -} - /// /// Note: Parsing some fields is missing, because they're not needed. 
/// @@ -321,9 +472,10 @@ pub struct XlXactParsedRecord { pub xact_time: TimestampTz, pub xinfo: u32, - pub db_id: Oid, /* MyDatabaseId */ - pub ts_id: Oid, /* MyDatabaseTableSpace */ - + pub db_id: Oid, + /* MyDatabaseId */ + pub ts_id: Oid, + /* MyDatabaseTableSpace */ pub subxacts: Vec, pub xnodes: Vec, @@ -455,9 +607,12 @@ impl MultiXactMember { #[repr(C)] #[derive(Debug)] pub struct XlMultiXactCreate { - pub mid: MultiXactId, /* new MultiXact's ID */ - pub moff: MultiXactOffset, /* its starting offset in members file */ - pub nmembers: u32, /* number of member XIDs */ + pub mid: MultiXactId, + /* new MultiXact's ID */ + pub moff: MultiXactOffset, + /* its starting offset in members file */ + pub nmembers: u32, + /* number of member XIDs */ pub members: Vec, } @@ -484,7 +639,8 @@ impl XlMultiXactCreate { pub struct XlMultiXactTruncate { pub oldest_multi_db: Oid, /* to-be-truncated range of multixact offsets */ - pub start_trunc_off: MultiXactId, /* just for completeness' sake */ + pub start_trunc_off: MultiXactId, + /* just for completeness' sake */ pub end_trunc_off: MultiXactId, /* to-be-truncated range of multixact members */ @@ -626,12 +782,10 @@ pub fn decode_wal_record( blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = if pg_version == 14 { - (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 - } else { - assert_eq!(pg_version, 15); - (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 - }; + blk.apply_image = dispatch_pgversion!( + pg_version, + (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 + ); let blk_img_is_compressed = postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 300c80e276..2c0d5b0c4f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -19,13 +19,16 @@ #include #include "postgres.h" + +#include "neon_pgversioncompat.h" + #include "funcapi.h" #include "miscadmin.h" #include 
"pgstat.h" #include "pagestore_client.h" #include "access/parallel.h" #include "postmaster/bgworker.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/buf_internals.h" #include "storage/latch.h" #include "storage/ipc.h" @@ -405,7 +408,7 @@ lfc_init(void) * Returns true if page is found in local cache. */ bool -lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; @@ -416,7 +419,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ return false; - tag.rnode = rnode; + CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); hash = get_hash_value(lfc_hash, &tag); @@ -432,7 +435,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) * Evict a page (if present) from the local file cache */ void -lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) +lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) { BufferTag tag; FileCacheEntry* entry; @@ -443,7 +446,9 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) if (lfc_size_limit == 0) /* fast exit if file cache is disabled */ return; - INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1))); + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1)); hash = get_hash_value(lfc_hash, &tag); @@ -501,7 +506,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno) * In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache. 
*/ bool -lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer) { BufferTag tag; @@ -519,7 +524,7 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (!lfc_ensure_opened()) return false; - tag.rnode = rnode; + CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); hash = get_hash_value(lfc_hash, &tag); @@ -568,8 +573,12 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, * If cache is full then evict some other page. */ void -lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer) +#else + const void *buffer) +#endif { BufferTag tag; FileCacheEntry* entry; @@ -584,9 +593,11 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, if (!lfc_ensure_opened()) return; - tag.rnode = rnode; tag.forkNum = forkNum; tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1); + + CopyNRelFileInfoToBufTag(tag, rinfo); + hash = get_hash_value(lfc_hash, &tag); LWLockAcquire(lfc_lock, LW_EXCLUSIVE); @@ -718,8 +729,13 @@ local_cache_pages(PG_FUNCTION_ARGS) tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs", INT8OID, -1, 0); +#if PG_MAJORVERSION_NUM < 16 TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", OIDOID, -1, 0); +#else + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenumber", + OIDOID, -1, 0); +#endif TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", @@ -770,9 +786,9 @@ local_cache_pages(PG_FUNCTION_ARGS) if (entry->bitmap[i >> 5] & (1 << (i & 31))) { fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i; - fctx->record[n_pages].relfilenode = entry->key.rnode.relNode; - 
fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode; - fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode; + fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key)); + fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key)); fctx->record[n_pages].forknum = entry->key.forkNum; fctx->record[n_pages].blocknum = entry->key.blockNum + i; fctx->record[n_pages].accesscount = entry->access_count; diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 4fdc7f8c82..c89de11594 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -442,7 +442,7 @@ pg_init_libpagestore(void) "Maximal attempts to reconnect to pages server (with 1 second timeout)", NULL, &max_reconnect_attempts, - 10, 0, INT_MAX, + 60, 0, INT_MAX, PGC_USERSET, 0, NULL, NULL, NULL); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index c7211ea05a..4850b0d6a1 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -33,6 +33,14 @@ void _PG_init(void); void _PG_init(void) { + /* + * Also load 'neon_rmgr'. This makes it unnecessary to list both 'neon' + * and 'neon_rmgr' in shared_preload_libraries. + */ +#if PG_VERSION_NUM >= 160000 + load_file("$libdir/neon_rmgr", false); +#endif + pg_init_libpagestore(); pg_init_walproposer(); @@ -40,9 +48,9 @@ _PG_init(void) pg_init_extension_server(); - // Important: This must happen after other parts of the extension - // are loaded, otherwise any settings to GUCs that were set before - // the extension was loaded will be removed. + // Important: This must happen after other parts of the extension + // are loaded, otherwise any settings to GUCs that were set before + // the extension was loaded will be removed. 
EmitWarningsOnPlaceholders("neon"); } diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h new file mode 100644 index 0000000000..8db0d5341e --- /dev/null +++ b/pgxn/neon/neon_pgversioncompat.h @@ -0,0 +1,112 @@ +/* + * Compatibility macros to cover up differences between supported PostgreSQL versions, + * to help with compiling the same sources for all of them. + */ + +#ifndef NEON_PGVERSIONCOMPAT_H +#define NEON_PGVERSIONCOMPAT_H + +#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) + +#define RelFileInfoEquals(a, b) ( \ + NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \ + NInfoGetDbOid(a) == NInfoGetDbOid(b) && \ + NInfoGetRelNumber(a) == NInfoGetRelNumber(b) \ +) + +/* buftag population & RelFileNode/RelFileLocator rework */ +#if PG_MAJORVERSION_NUM < 16 + +#define InitBufferTag(tag, rfn, fn, bn) INIT_BUFFERTAG(*tag, *rfn, fn, bn) + +#define USE_RELFILENODE + +#define RELFILEINFO_HDR "storage/relfilenode.h" + +#define NRelFileInfo RelFileNode +#define NRelFileInfoBackend RelFileNodeBackend +#define NRelFileNumber Oid + +#define InfoFromRelation(rel) (rel)->rd_node +#define InfoFromSMgrRel(srel) (srel)->smgr_rnode.node +#define InfoBFromSMgrRel(srel) (srel)->smgr_rnode +#define InfoFromNInfoB(ninfob) ninfob.node + +#define RelFileInfoFmt(rinfo) \ + (rinfo).spcNode, \ + (rinfo).dbNode, \ + (rinfo).relNode + +#define RelFileInfoBackendFmt(ninfob) \ + (ninfob).backend, \ + (ninfob).node.spcNode, \ + (ninfob).node.dbNode, \ + (ninfob).node.relNode + +#define NInfoGetSpcOid(ninfo) (ninfo).spcNode +#define NInfoGetDbOid(ninfo) (ninfo).dbNode +#define NInfoGetRelNumber(ninfo) (ninfo).relNode + +#define CopyNRelFileInfoToBufTag(tag, rinfo) \ + do { \ + (tag).rnode = (rinfo); \ + } while (false); + +#define BufTagGetNRelFileInfo(tag) tag.rnode + +#define SMgrRelGetRelInfo(reln) \ + (reln->smgr_rnode.node) + +#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers + +#else /* major version >= 16 */ + +#define 
USE_RELFILELOCATOR + +#define BUFFERTAGS_EQUAL(a, b) BufferTagsEqual(&(a), &(b)) + +#define RELFILEINFO_HDR "storage/relfilelocator.h" + +#define NRelFileInfo RelFileLocator +#define NRelFileInfoBackend RelFileLocatorBackend + +#define InfoFromRelation(rel) (rel)->rd_locator +#define InfoFromSMgrRel(srel) (srel)->smgr_rlocator.locator +#define InfoBFromSMgrRel(srel) (srel)->smgr_rlocator +#define InfoFromNInfoB(ninfob) (ninfob).locator + +#define RelFileInfoFmt(rinfo) \ + (rinfo).spcOid, \ + (rinfo).dbOid, \ + (rinfo).relNumber +#define RelFileInfoBackendFmt(ninfob) \ + (ninfob).backend, \ + (ninfob).locator.spcOid, \ + (ninfob).locator.dbOid, \ + (ninfob).locator.relNumber + +#define NInfoGetSpcOid(ninfo) (ninfo).spcOid +#define NInfoGetDbOid(ninfo) (ninfo).dbOid +#define NInfoGetRelNumber(ninfo) (ninfo).relNumber + +#define CopyNRelFileInfoToBufTag(tag, rinfo) \ + do { \ + (tag).spcOid = (rinfo).spcOid; \ + (tag).dbOid = (rinfo).dbOid; \ + (tag).relNumber = (rinfo).relNumber; \ + } while (false); + +#define BufTagGetNRelFileInfo(tag) \ + ((RelFileLocator) { \ + .spcOid = (tag).spcOid, \ + .dbOid = (tag).dbOid, \ + .relNumber = (tag).relNumber, \ + }) + +#define SMgrRelGetRelInfo(reln) \ + ((reln)->smgr_rlocator) + +#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers +#endif + +#endif //NEON_PGVERSIONCOMPAT_H diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 2889db49bc..d61f74b5c8 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -14,9 +14,10 @@ #define pageserver_h #include "postgres.h" +#include "neon_pgversioncompat.h" #include "access/xlogdefs.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/block.h" #include "storage/smgr.h" #include "lib/stringinfo.h" @@ -71,14 +72,14 @@ typedef struct typedef struct { NeonRequest req; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; } NeonExistsRequest; typedef struct { NeonRequest req; - RelFileNode rnode; + 
NRelFileInfo rinfo; ForkNumber forknum; } NeonNblocksRequest; @@ -91,7 +92,7 @@ typedef struct typedef struct { NeonRequest req; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; } NeonGetPageRequest; @@ -164,7 +165,7 @@ extern char *neon_tenant; extern bool wal_redo; extern int32 max_cluster_size; -extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); +extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); @@ -175,19 +176,35 @@ extern void neon_open(SMgrRelation reln); extern void neon_close(SMgrRelation reln, ForkNumber forknum); extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern bool neon_exists(SMgrRelation reln, ForkNumber forknum); -extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void neon_unlink(NRelFileInfoBackend rnode, ForkNumber forknum, bool isRedo); +#if PG_MAJORVERSION_NUM < 16 extern void neon_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +#else +extern void neon_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nbuffers, bool skipFsync); +#endif + extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); + +#if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); - -extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer); - extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool 
skipFsync); +#else +extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer); +extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, void *buffer); +extern void neon_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +#endif extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); @@ -198,16 +215,22 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum); /* utils for neon relsize cache */ extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); -extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); +extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size); +extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); /* functions for local file cache */ -extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); -extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); -extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +#if PG_MAJORVERSION_NUM < 16 +extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + char *buffer); +#else 
+extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer); +#endif +extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno); extern void lfc_init(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 76d71dd94b..c39bae823c 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -58,7 +58,6 @@ #include "postmaster/autovacuum.h" #include "replication/walsender.h" #include "storage/bufmgr.h" -#include "storage/relfilenode.h" #include "storage/buf_internals.h" #include "storage/smgr.h" #include "storage/md.h" @@ -86,7 +85,10 @@ static char *hexdump_page(char *page); #endif -#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) +#define IS_LOCAL_REL(reln) (\ + NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \ + NInfoGetRelNumber(InfoFromSMgrRel(reln)) > FirstNormalObjectId \ +) const int SmgrTrace = DEBUG5; @@ -160,6 +162,7 @@ typedef enum PrefetchStatus { typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ XLogRecPtr effective_request_lsn; + XLogRecPtr actual_request_lsn; NeonResponse *response; /* may be null */ PrefetchStatus status; uint64 my_ring_index; @@ -255,7 +258,7 @@ static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, +static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); static bool @@ -314,6 +317,7 @@ compact_prefetch_buffers(void) target_slot->status = source_slot->status; target_slot->response = source_slot->response; 
target_slot->effective_request_lsn = source_slot->effective_request_lsn; + target_slot->actual_request_lsn = source_slot->actual_request_lsn; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -500,6 +504,11 @@ prefetch_wait_for(uint64 ring_index) { entry = GetPrfSlot(MyPState->ring_receive); +#if PG_MAJORVERSION_NUM >= 16 + /* ensure the log is actually flushed up to the request point */ + XLogFlush(entry->actual_request_lsn); +#endif + Assert(entry->status == PRFS_REQUESTED); if (!prefetch_read(entry)) return false; @@ -634,7 +643,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force .req.tag = T_NeonGetPageRequest, .req.latest = false, .req.lsn = 0, - .rnode = slot->buftag.rnode, + .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; @@ -643,13 +652,13 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force { request.req.lsn = *force_lsn; request.req.latest = *force_latest; - slot->effective_request_lsn = *force_lsn; + slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; } else { XLogRecPtr lsn = neon_get_request_lsn( &request.req.latest, - slot->buftag.rnode, + BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum ); @@ -671,7 +680,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * The best LSN to use for effective_request_lsn would be * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
*/ - request.req.lsn = lsn; + slot->actual_request_lsn = request.req.lsn = lsn; prefetch_lsn = Max(prefetch_lsn, lsn); slot->effective_request_lsn = prefetch_lsn; } @@ -893,9 +902,9 @@ nm_pack_request(NeonRequest * msg) pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); break; @@ -906,9 +915,9 @@ nm_pack_request(NeonRequest * msg) pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); break; @@ -929,9 +938,9 @@ nm_pack_request(NeonRequest * msg) pq_sendbyte(&s, msg_req->req.latest); pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); pq_sendbyte(&s, msg_req->forknum); pq_sendint32(&s, msg_req->blkno); @@ -1063,10 +1072,7 @@ nm_to_string(NeonMessage * msg) NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": 
%d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -1079,10 +1085,7 @@ nm_to_string(NeonMessage * msg) NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); @@ -1095,10 +1098,7 @@ nm_to_string(NeonMessage * msg) NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); @@ -1187,13 +1187,13 @@ nm_to_string(NeonMessage * msg) * directly because it skips the logging if the LSN is new enough. 
*/ static XLogRecPtr -log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, +log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { PGAlignedBlock copied_buffer; memcpy(copied_buffer.data, page, BLCKSZ); - return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); + return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); } /* @@ -1210,9 +1210,14 @@ PageIsEmptyHeapPage(char *buffer) } static void -neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +#if PG_MAJORVERSION_NUM < 16 + char *buffer, bool force) +#else + const char *buffer, bool force) +#endif { - XLogRecPtr lsn = PageGetLSN(buffer); + XLogRecPtr lsn = PageGetLSN((Page) buffer); if (ShutdownRequestPending) return; @@ -1232,15 +1237,14 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, + (Page) buffer, false); XLogFlush(recptr); lsn = recptr; ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } else if (lsn == InvalidXLogRecPtr) @@ -1263,24 +1267,20 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch * sign: it implies that the page was not WAL-logged, and its contents * will be lost when it's evicted. 
*/ - if (PageIsNew(buffer)) + if (PageIsNew((Page) buffer)) { ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else if (PageIsEmptyHeapPage(buffer)) + else if (PageIsEmptyHeapPage((Page) buffer)) { ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } else @@ -1288,9 +1288,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch ereport(PANIC, (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } } @@ -1299,9 +1297,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch ereport(SmgrTrace, (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } @@ -1309,7 +1305,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. 
*/ - SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); + SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum); } /* @@ -1379,7 +1375,7 @@ nm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; @@ -1394,7 +1390,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN /* * Get the last written LSN of this page. */ - lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = GetLastWrittenLSN(rinfo, forknum, blkno); lsn = nm_adjust_lsn(lsn); elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", @@ -1416,7 +1412,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenLSN(rnode, forknum, blkno); + lsn = GetLastWrittenLSN(rinfo, forknum, blkno); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -1485,7 +1481,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks)) { return true; } @@ -1500,20 +1496,26 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) * * For now, handle that special case here. 
*/ +#if PG_MAJORVERSION_NUM >= 16 + if (reln->smgr_rlocator.locator.spcOid == 0 && + reln->smgr_rlocator.locator.dbOid == 0 && + reln->smgr_rlocator.locator.relNumber == 0) +#else if (reln->smgr_rnode.node.spcNode == 0 && reln->smgr_rnode.node.dbNode == 0 && reln->smgr_rnode.node.relNode == 0) +#endif { return false; } - request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, .req.latest = latest, .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, + .rinfo = InfoFromSMgrRel(reln), .forknum = forkNum}; resp = page_server_request(&request); @@ -1529,9 +1531,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -1571,9 +1571,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) } elog(SmgrTrace, "Create relation %u/%u/%u.%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum); /* @@ -1597,12 +1595,12 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) */ if (isRedo) { - update_cached_relsize(reln->smgr_rnode.node, forkNum, 0); - get_cached_relsize(reln->smgr_rnode.node, forkNum, + update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0); + get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &reln->smgr_cached_nblocks[forkNum]); } else - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + set_cached_relsize(InfoFromSMgrRel(reln), 
forkNum, 0); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1629,17 +1627,17 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * we are usually not in a transaction anymore when this is called. */ void -neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo) { /* * Might or might not exist locally, depending on whether it's an unlogged * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to * unlink, it won't do any harm if the file doesn't exist. */ - mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) + mdunlink(rinfo, forkNum, isRedo); + if (!NRelFileInfoBackendIsTemp(rinfo)) { - forget_cached_relsize(rnode.node, forkNum); + forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum); } } @@ -1653,8 +1651,13 @@ neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * causes intervening file space to become filled with zeroes. */ void +#if PG_MAJORVERSION_NUM < 16 neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) +#else +neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + const void *buffer, bool skipFsync) +#endif { XLogRecPtr lsn; BlockNumber n_blocks = 0; @@ -1707,17 +1710,15 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); neon_wallog_page(reln, forkNum, blkno, buffer, false); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1); - lsn = PageGetLSN(buffer); + lsn = PageGetLSN((Page) buffer); elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); - 
lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer); + lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1732,11 +1733,98 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (lsn == InvalidXLogRecPtr) { lsn = GetXLogInsertRecPtr(); - SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, blkno); } - SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); + SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); } +#if PG_MAJORVERSION_NUM >= 16 +void +neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, + int nblocks, bool skipFsync) +{ + const PGAlignedBlock buffer = {0}; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + XLogRecPtr lsn = 0; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) + { + uint64 current_size = GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); + } + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. 
+ */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forkNum), + InvalidBlockNumber))); + + /* Don't log any pages if we're not allowed to do so. */ + if (!XLogInsertAllowed()) + return; + + while (remblocks > 0) + { + int count = Min(remblocks, XLR_MAX_BLOCK_ID); + + XLogBeginInsert(); + + for (int i = 0; i < count; i++) + XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i, + (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI); + + for (int i = 0; i < count; i++) + { + lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); + SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, + blocknum + i); + } + + blocknum += count; + remblocks -= count; + } + + Assert(lsn != 0); + + SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum); + set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum); +} +#endif + /* * neon_open() -- Initialize newly-opened relation. */ @@ -1792,14 +1880,14 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum)) + if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) return false; tag = (BufferTag) { - .rnode = reln->smgr_rnode.node, .forkNum = forknum, .blockNum = blocknum }; + CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); ring_index = prefetch_register_buffer(tag, NULL, NULL); @@ -1851,9 +1939,15 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ -void -neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 +void PGDLLEXPORT +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer) +#else +void PGDLLEXPORT +neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, void *buffer) +#endif { NeonResponse *resp; BufferTag buftag; @@ -1862,11 +1956,12 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, PrefetchRequest *slot; buftag = (BufferTag) { - .rnode = rnode, .forkNum = forkNum, .blockNum = blkno, }; + CopyNRelFileInfoToBufTag(buftag, rinfo); + /* * The redo process does not lock pages that it needs to replay but are * not in the shared buffers, so a concurrent process may request the @@ -1957,7 +2052,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, { case T_NeonGetPageResponse: memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); - lfc_write(rnode, forkNum, blkno, buffer); + lfc_write(rinfo, forkNum, blkno, buffer); break; case T_NeonErrorResponse: @@ -1965,9 +2060,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", blkno, - rnode.spcNode, - rnode.dbNode, - rnode.relNode, + RelFileInfoFmt(rinfo), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -1987,7 +2080,11 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, */ void neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer) +#else + void *buffer) +#endif { bool latest; XLogRecPtr request_lsn; @@ -2010,13 +2107,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } /* Try to read from local file cache 
*/ - if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer)) + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { return; } - request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); - neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -2030,27 +2127,23 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked, mdbuf, BLCKSZ); - if (PageIsNew(mdbuf)) + if (PageIsNew((Page) mdbuf)) { - if (!PageIsNew(pageserver_masked)) + if (!PageIsNew((Page) pageserver_masked)) { elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(buffer)); } } - else if (PageIsNew(buffer)) + else if (PageIsNew((Page) buffer)) { elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf)); @@ -2065,9 +2158,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + 
RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf_masked), @@ -2086,9 +2177,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, hexdump_page(mdbuf_masked), @@ -2130,7 +2219,11 @@ hexdump_page(char *page) */ void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +#if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) +#else + const void *buffer, bool skipFsync) +#endif { XLogRecPtr lsn; @@ -2168,15 +2261,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_wallog_page(reln, forknum, blocknum, buffer, false); - lsn = PageGetLSN(buffer); + lsn = PageGetLSN((Page) buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, (uint32) (lsn >> 32), (uint32) lsn); - lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer); + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2212,23 +2303,21 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks)) { elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + 
RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, n_blocks); return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, .req.latest = latest, .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, + .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2245,9 +2334,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -2257,12 +2344,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) default: elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); } - update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, n_blocks); @@ -2281,7 +2366,7 @@ neon_dbsize(Oid dbNode) int64 db_size; XLogRecPtr request_lsn; bool latest; - RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; + NRelFileInfo dummy_node = {0}; request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { @@ -2350,7 +2435,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) elog(ERROR, "unknown relpersistence '%c'", 
reln->smgr_relpersistence); } - set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); /* * Truncating a relation drops all its buffers from the buffer cache @@ -2378,7 +2463,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * for the extended pages, so there's no harm in leaving behind obsolete * entries for the truncated chunks. */ - SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); + SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forknum); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2448,9 +2533,7 @@ neon_start_unlogged_build(SMgrRelation reln) ereport(SmgrTrace, (errmsg("starting unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); + RelFileInfoFmt(InfoFromSMgrRel(reln))))); switch (reln->smgr_relpersistence) { @@ -2500,9 +2583,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) ereport(SmgrTrace, (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); + RelFileInfoFmt(InfoFromSMgrRel(reln))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; @@ -2525,18 +2606,16 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) static void neon_end_unlogged_build(SMgrRelation reln) { + NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); + Assert(unlogged_build_rel == reln); ereport(SmgrTrace, (errmsg("ending unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); + RelFileInfoFmt(InfoFromNInfoB(rinfob))))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { - RelFileNodeBackend rnode; - Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); @@ -2544,19 +2623,17 @@ 
neon_end_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; /* Remove local copy */ - rnode = reln->smgr_rnode; + rinfob = InfoBFromSMgrRel(reln); for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", - rnode.node.spcNode, - rnode.node.dbNode, - rnode.node.relNode, + RelFileInfoFmt(InfoFromNInfoB(rinfob)), forknum); - forget_cached_relsize(rnode.node, forknum); + forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); mdclose(reln, forknum); /* use isRedo == true, so that we drop it immediately */ - mdunlink(rnode, forknum, true); + mdunlink(rinfob, forknum, true); } } @@ -2608,6 +2685,9 @@ static const struct f_smgr neon_smgr = .smgr_exists = neon_exists, .smgr_unlink = neon_unlink, .smgr_extend = neon_extend, +#if PG_MAJORVERSION_NUM >= 16 + .smgr_zeroextend = neon_zeroextend, +#endif .smgr_prefetch = neon_prefetch, .smgr_read = neon_read, .smgr_write = neon_write, @@ -2622,12 +2702,12 @@ static const struct f_smgr neon_smgr = }; const f_smgr * -smgr_neon(BackendId backend, RelFileNode rnode) +smgr_neon(BackendId backend, NRelFileInfo rinfo) { /* Don't use page server for temp relations */ if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); + return smgr_standard(backend, rinfo); else return &neon_smgr; } @@ -2681,7 +2761,7 @@ bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blkno; BufferTag tag; @@ -2695,10 +2775,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) return true; #if PG_VERSION_NUM < 150000 - if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno)) elog(PANIC, "failed to locate backup block with ID %d", block_id); #else - XLogRecGetBlockTag(record, block_id, &rnode, &forknum, 
&blkno); + XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno); #endif /* @@ -2706,10 +2786,13 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) * regardless of whether the block is stored in shared buffers. * See also this function's top comment. */ - if (!OidIsValid(rnode.dbNode)) + if (!OidIsValid(NInfoGetDbOid(rinfo))) return false; - INIT_BUFFERTAG(tag, rnode, forknum, blkno); + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forknum; + tag.blockNum = blkno; + hash = BufTableHashCode(&tag); partitionLock = BufMappingPartitionLock(hash); @@ -2725,24 +2808,24 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) no_redo_needed = buffer < 0; /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno); + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); /* we don't have the buffer in memory, update lwLsn past this record, * also evict page fro file cache */ if (no_redo_needed) - lfc_evict(rnode, forknum, blkno); + lfc_evict(rinfo, forknum, blkno); LWLockRelease(partitionLock); /* Extend the relation if we know its size */ - if (get_cached_relsize(rnode, forknum, &relsize)) + if (get_cached_relsize(rinfo, forknum, &relsize)) { if (relsize < blkno + 1) { - update_cached_relsize(rnode, forknum, blkno + 1); - SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); + update_cached_relsize(rinfo, forknum, blkno + 1); + SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); } } else @@ -2763,7 +2846,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) .latest = false, .tag = T_NeonNblocksRequest, }, - .rnode = rnode, + .rinfo = rinfo, .forknum = forknum, }; @@ -2774,8 +2857,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) Assert(nbresponse->n_blocks > blkno); - set_cached_relsize(rnode, forknum, nbresponse->n_blocks); - SetLastWrittenLSNForRelation(end_recptr, rnode, forknum); + 
set_cached_relsize(rinfo, forknum, nbresponse->n_blocks); + SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks); } diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index d4262c730a..b13134b5c3 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -14,8 +14,10 @@ */ #include "postgres.h" +#include "neon_pgversioncompat.h" + #include "pagestore_client.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/smgr.h" #include "storage/lwlock.h" #include "storage/ipc.h" @@ -30,7 +32,7 @@ typedef struct { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; } RelTag; @@ -75,7 +77,7 @@ neon_smgr_shmem_startup(void) } bool -get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) +get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) { bool found = false; @@ -84,7 +86,7 @@ get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) RelTag tag; RelSizeEntry *entry; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_SHARED); entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); @@ -99,14 +101,14 @@ get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) } void -set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { RelTag tag; RelSizeEntry *entry; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); @@ -116,7 +118,7 @@ set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) } void -update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { if 
(relsize_hash_size > 0) { @@ -124,7 +126,7 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) RelSizeEntry *entry; bool found; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); @@ -135,13 +137,13 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) } void -forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum) { if (relsize_hash_size > 0) { RelTag tag; - tag.rnode = rnode; + tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7ca335945d..334c0bcce5 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -51,6 +51,9 @@ #include "libpq/pqformat.h" #include "replication/slot.h" #include "replication/walreceiver.h" +#if PG_VERSION_NUM >= 160000 +#include "replication/walsender_private.h" +#endif #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" @@ -73,10 +76,10 @@ static bool syncSafekeepers = false; -char *wal_acceptors_list; -int wal_acceptor_reconnect_timeout; -int wal_acceptor_connection_timeout; -bool am_wal_proposer; +char *wal_acceptors_list = ""; +int wal_acceptor_reconnect_timeout = 1000; +int wal_acceptor_connection_timeout = 10000; +bool am_wal_proposer = false; #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" @@ -191,7 +194,7 @@ pg_init_walproposer(void) /* * Entry point for `postgres --sync-safekeepers`. */ -void +PGDLLEXPORT void WalProposerSync(int argc, char *argv[]) { struct stat stat_buf; @@ -315,7 +318,7 @@ nwp_shmem_startup_hook(void) /* * WAL proposer bgworker entry point. 
*/ -void +PGDLLEXPORT void WalProposerMain(Datum main_arg) { #if PG_VERSION_NUM >= 150000 @@ -384,40 +387,89 @@ WalProposerPoll(void) while (true) { Safekeeper *sk; - int rc; + bool wait_timeout; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); - rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - sk = (Safekeeper *) event.user_data; +#if PG_MAJORVERSION_NUM >= 16 + if (WalSndCtl != NULL) + ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); +#endif /* - * If the event contains something that one of our safekeeper states - * was waiting for, we'll advance its state. + * Wait for a wait event to happen, or timeout: + * - Safekeeper socket can become available for READ or WRITE + * - Our latch got set, because + * * PG15-: We got woken up by a process triggering the WalSender + * * PG16+: WalSndCtl->wal_flush_cv was triggered */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(sk, event.events); - - /* - * If the timeout expired, attempt to reconnect to any safekeepers - * that we dropped - */ - ReconnectSafekeepers(); - - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if (rc != 0 && (event.events & WL_LATCH_SET)) + if (WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN) == 1) { - ResetLatch(MyLatch); - break; + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. 
(no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (event.events & WL_LATCH_SET) + { + /* Reset our latch */ + ResetLatch(MyLatch); + +#if PG_MAJORVERSION_NUM >= 16 + if (WalSndCtl != NULL) + ConditionVariableCancelSleep(); +#endif + break; + } + + /* + * If the event contains something that one of our safekeeper states + * was waiting for, we'll advance its state. + */ + if (event.events & (WL_SOCKET_MASK)) + { + sk = (Safekeeper *) event.user_data; + AdvancePollState(sk, event.events); + } + else + pg_unreachable(); + } + else /* timeout expired */ + { +#if PG_MAJORVERSION_NUM >= 16 + /* First, cancel sleep - we might do some complex stuff after this */ + if (WalSndCtl != NULL) + ConditionVariableCancelSleep(); +#endif + + /* + * If the timeout expired, attempt to reconnect to any safekeepers + * that we dropped + */ + ReconnectSafekeepers(); + wait_timeout = true; + + /* + * Ensure flushrecptr is set to a recent value. This fixes a case + * where we've not been notified of new WAL records when we were + * planning on consuming them. + */ + if (!syncSafekeepers) { + XLogRecPtr flushed; + +#if PG_MAJORVERSION_NUM < 15 + flushed = GetFlushRecPtr(); +#else + flushed = GetFlushRecPtr(NULL); +#endif + if (flushed != availableLsn) + break; + } } now = GetCurrentTimestamp(); - if (rc == 0 || TimeToReconnect(now) <= 0) /* timeout expired: poll state */ + if (wait_timeout || TimeToReconnect(now) <= 0) /* timeout expired: poll state */ { TimestampTz now; @@ -611,7 +663,8 @@ UpdateEventSet(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } -/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. +/* + * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. * * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. 
*/ @@ -1408,7 +1461,12 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec elog(FATAL, "could not append password to the safekeeper connection string"); } +#if PG_MAJORVERSION_NUM < 16 wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); +#else + wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); +#endif + if (!wrconn) { ereport(WARNING, @@ -2242,9 +2300,10 @@ HandleSafekeeperResponse(void) if (synced) n_synced++; } + if (n_synced >= quorum) { - /* All safekeepers synced! */ + /* A quorum of safekeepers has been synced! */ /* * Send empty message to broadcast latest truncateLsn to all safekeepers. diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 615fbf9399..fa1ba30a8f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -379,8 +379,8 @@ typedef struct Safekeeper AppendResponse appendResponse; /* feedback for master */ } Safekeeper; -extern void WalProposerSync(int argc, char *argv[]); -extern void WalProposerMain(Datum main_arg); +extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); +extern void PGDLLEXPORT WalProposerMain(Datum main_arg); extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(void); extern void ParsePageserverFeedbackMessage(StringInfo reply_message, diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index 9e1fc11756..05030360f6 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -25,6 +25,9 @@ #include "access/xlogutils.h" #include "access/xlogrecovery.h" #endif +#if PG_MAJORVERSION_NUM >= 16 +#include "utils/guc.h" +#endif /* * These variables are used similarly to openLogFile/SegNo, @@ -558,11 +561,11 @@ StartProposerReplication(StartReplicationCmd *cmd) static void WalSndLoop(void) { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + for (;;) { - /* Clear any already-pending wakeups */ - 
ResetLatch(MyLatch); - CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(); diff --git a/pgxn/neon_rmgr/Makefile b/pgxn/neon_rmgr/Makefile new file mode 100644 index 0000000000..20f0a78d79 --- /dev/null +++ b/pgxn/neon_rmgr/Makefile @@ -0,0 +1,19 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon_rmgr +OBJS = \ + $(WIN32RES) \ + neon_rmgr.o \ + neon_rmgr_decode.o \ + neon_rmgr_desc.o + + +EXTENSION = neon_rmgr +DATA = +PGFILEDESC = "Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_rmgr/neon_rmgr.c b/pgxn/neon_rmgr/neon_rmgr.c new file mode 100644 index 0000000000..496ca08c08 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr.c @@ -0,0 +1,886 @@ +#include "postgres.h" +#include "fmgr.h" + +#if PG_MAJORVERSION_NUM >= 16 +#include "access/bufmask.h" +#include "access/heapam_xlog.h" +#include "access/htup_details.h" +#include "access/neon_xlog.h" +#include "access/rmgr.h" +#include "access/visibilitymap.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/freespace.h" +#include "neon_rmgr.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + +static void neon_rm_redo(XLogReaderState *record); +static void neon_rm_startup(void); +static void neon_rm_cleanup(void); +static void neon_rm_mask(char *pagedata, BlockNumber blkno); + +static void redo_neon_heap_insert(XLogReaderState *record); +static void redo_neon_heap_delete(XLogReaderState *record); +static void redo_neon_heap_update(XLogReaderState *record, bool hot_update); +static void redo_neon_heap_lock(XLogReaderState *record); +static void redo_neon_heap_multi_insert(XLogReaderState *record); + +const static RmgrData NeonRmgr = { + .rm_name = "neon", + .rm_redo = neon_rm_redo, + .rm_desc = neon_rm_desc, + .rm_identify = neon_rm_identify, + .rm_startup = 
neon_rm_startup, + .rm_cleanup = neon_rm_cleanup, + .rm_mask = neon_rm_mask, + .rm_decode = neon_rm_decode, +}; + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + RegisterCustomRmgr(RM_NEON_ID, &NeonRmgr); +} + +static void +neon_rm_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_NEON_OPMASK) + { + case XLOG_NEON_HEAP_INSERT: + redo_neon_heap_insert(record); + break; + case XLOG_NEON_HEAP_DELETE: + redo_neon_heap_delete(record); + break; + case XLOG_NEON_HEAP_UPDATE: + redo_neon_heap_update(record, false); + break; + case XLOG_NEON_HEAP_HOT_UPDATE: + redo_neon_heap_update(record, true); + break; + case XLOG_NEON_HEAP_LOCK: + redo_neon_heap_lock(record); + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + redo_neon_heap_multi_insert(record); + break; + default: + elog(PANIC, "neon_rm_redo: unknown op code %u", info); + } +} + +static void +neon_rm_startup(void) +{ + /* nothing to do here */ +} + +static void +neon_rm_cleanup(void) +{ + /* nothing to do here */ +} + +static void +neon_rm_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. 
*/ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. + * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions on the primary. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + + /* + * NB: Not ignoring ctid changes due to the tuple having moved + * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's + * important information that needs to be in-sync between primary + * and standby, and thus is WAL logged. + */ + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} + + +/* + * COPIED FROM heapam.c + * Given an "infobits" field from an XLog record, set the correct bits in the + * given infomask and infomask2 for the tuple touched by the record. + * + * (This is the reverse of compute_infobits). 
+ */ +static void +fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2) +{ + *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | + HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK); + *infomask2 &= ~HEAP_KEYS_UPDATED; + + if (infobits & XLHL_XMAX_IS_MULTI) + *infomask |= HEAP_XMAX_IS_MULTI; + if (infobits & XLHL_XMAX_LOCK_ONLY) + *infomask |= HEAP_XMAX_LOCK_ONLY; + if (infobits & XLHL_XMAX_EXCL_LOCK) + *infomask |= HEAP_XMAX_EXCL_LOCK; + /* note HEAP_XMAX_SHR_LOCK isn't considered here */ + if (infobits & XLHL_XMAX_KEYSHR_LOCK) + *infomask |= HEAP_XMAX_KEYSHR_LOCK; + + if (infobits & XLHL_KEYS_UPDATED) + *infomask2 |= HEAP_KEYS_UPDATED; +} + +static void +redo_neon_heap_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + xl_neon_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + RelFileLocator target_locator; + BlockNumber blkno; + ItemPointerData target_tid; + XLogRedoAction action; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. 
+ */ + if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size datalen; + char *data; + + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum) + elog(PANIC, "neon_rm_redo: invalid max offset number"); + + data = XLogRecGetBlockData(record, 0, &datalen); + + newlen = datalen - SizeOfNeonHeapHeader; + Assert(datalen > SizeOfNeonHeapHeader && newlen <= MaxHeapTupleSize); + memcpy((char *) &xlhdr, data, SizeOfNeonHeapHeader); + data += SizeOfNeonHeapHeader; + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + data, + newlen); + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + htup->t_ctid = target_tid; + + if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, + true, true) == InvalidOffsetNumber) + elog(PANIC, "neon_rm_redo: failed to add tuple"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. 
We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); +} + +static void +redo_neon_heap_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) XLogRecGetData(record); + Buffer buffer; + Page page; + ItemId lp = NULL; + HeapTupleHeader htup; + BlockNumber blkno; + RelFileLocator target_locator; + ItemPointerData target_tid; + + XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno); + ItemPointerSetBlockNumber(&target_tid, blkno); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(target_locator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = BufferGetPage(buffer); + + if (PageGetMaxOffsetNumber(page) >= xlrec->offnum) + lp = PageGetItemId(page, xlrec->offnum); + + if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "neon_rm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->infobits_set, + &htup->t_infomask, &htup->t_infomask2); + if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + else + HeapTupleHeaderSetXmin(htup, InvalidTransactionId); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* Make sure t_ctid is set correctly */ + if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE) + HeapTupleHeaderSetMovedPartitions(htup); + else + htup->t_ctid = target_tid; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +redo_neon_heap_update(XLogReaderState *record, bool hot_update) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_update *xlrec = (xl_neon_heap_update *) XLogRecGetData(record); + RelFileLocator rlocator; + BlockNumber oldblk; + BlockNumber newblk; + ItemPointerData newtid; + Buffer obuffer, + nbuffer; + Page page; + OffsetNumber offnum; + ItemId lp = 
NULL; + HeapTupleData oldtup; + HeapTupleHeader htup; + uint16 prefixlen = 0, + suffixlen = 0; + char *newp; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + xl_neon_heap_header xlhdr; + uint32 newlen; + Size freespace = 0; + XLogRedoAction oldaction; + XLogRedoAction newaction; + + /* initialize to keep the compiler quiet */ + oldtup.t_data = NULL; + oldtup.t_len = 0; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk); + if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL)) + { + /* HOT updates are never done across pages */ + Assert(!hot_update); + } + else + oldblk = newblk; + + ItemPointerSet(&newtid, newblk, xlrec->new_offnum); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. + */ + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, oldblk, &vmbuffer); + visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* + * In normal operation, it is important to lock the two pages in + * page-number order, to avoid possible deadlocks against other update + * operations going the other way. However, during WAL replay there can + * be no other update happening, so we don't need to worry about that. But + * we *do* need to worry that we don't expose an inconsistent state to Hot + * Standby queries --- so the original page can't be unlocked before we've + * added the new tuple to the new page. + */ + + /* Deal with old tuple version */ + oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 
0 : 1, + &obuffer); + if (oldaction == BLK_NEEDS_REDO) + { + page = BufferGetPage(obuffer); + offnum = xlrec->old_offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "neon_rm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + oldtup.t_data = htup; + oldtup.t_len = ItemIdGetLength(lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); + fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, + &htup->t_infomask2); + HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + /* Set forward chain link in t_ctid */ + htup->t_ctid = newtid; + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page, XLogRecGetXid(record)); + + if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + PageSetLSN(page, lsn); + MarkBufferDirty(obuffer); + } + + /* + * Read the page the new tuple goes into, if different from old. + */ + if (oldblk == newblk) + { + nbuffer = obuffer; + newaction = oldaction; + } + else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) + { + nbuffer = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(nbuffer); + PageInit(page, BufferGetPageSize(nbuffer), 0); + newaction = BLK_NEEDS_REDO; + } + else + newaction = XLogReadBufferForRedo(record, 0, &nbuffer); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, newblk, &vmbuffer); + visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + /* Deal with new tuple */ + if (newaction == BLK_NEEDS_REDO) + { + char *recdata; + char *recdata_end; + Size datalen; + Size tuplen; + + recdata = XLogRecGetBlockData(record, 0, &datalen); + recdata_end = recdata + datalen; + + page = BufferGetPage(nbuffer); + + offnum = xlrec->new_offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "neon_rm_redo: invalid max offset number"); + + if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&prefixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD) + { + Assert(newblk == oldblk); + memcpy(&suffixlen, recdata, sizeof(uint16)); + recdata += sizeof(uint16); + } + + memcpy((char *) &xlhdr, recdata, SizeOfNeonHeapHeader); + recdata += SizeOfNeonHeapHeader; + + tuplen = recdata_end - recdata; + Assert(tuplen <= MaxHeapTupleSize); + + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + + /* + * Reconstruct the new tuple using the prefix and/or suffix from the + * old tuple, and the data stored in the WAL record. 
+ */ + newp = (char *) htup + SizeofHeapTupleHeader; + if (prefixlen > 0) + { + int len; + + /* copy bitmap [+ padding] [+ oid] from WAL record */ + len = xlhdr.t_hoff - SizeofHeapTupleHeader; + memcpy(newp, recdata, len); + recdata += len; + newp += len; + + /* copy prefix from old tuple */ + memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen); + newp += prefixlen; + + /* copy new tuple data from WAL record */ + len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader); + memcpy(newp, recdata, len); + recdata += len; + newp += len; + } + else + { + /* + * copy bitmap [+ padding] [+ oid] + data from record, all in one + * go + */ + memcpy(newp, recdata, tuplen); + recdata += tuplen; + newp += tuplen; + } + Assert(recdata == recdata_end); + + /* copy suffix from old tuple */ + if (suffixlen > 0) + memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen); + + newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen; + htup->t_infomask2 = xlhdr.t_infomask2; + htup->t_infomask = xlhdr.t_infomask; + htup->t_hoff = xlhdr.t_hoff; + + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); + HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); + /* Make sure there is no forward chain link in t_ctid */ + htup->t_ctid = newtid; + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "neon_rm_redo: failed to add tuple"); + + if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + MarkBufferDirty(nbuffer); + } + + if (BufferIsValid(nbuffer) && nbuffer != obuffer) + UnlockReleaseBuffer(nbuffer); + if (BufferIsValid(obuffer)) + UnlockReleaseBuffer(obuffer); + + /* + * If the new page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. 
We can't do much + * better than that without knowing the fill-factor for the table. + * + * However, don't update the FSM on HOT updates, because after crash + * recovery, either the old or the new tuple will certainly be dead and + * prunable. After pruning, the page will have roughly as much free space + * as it did before the update, assuming the new tuple is about the same + * size as the old one. + * + * XXX: Don't do this if the page was restored from full page image. We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); +} + +static void +redo_neon_heap_lock(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber offnum; + ItemId lp = NULL; + HeapTupleHeader htup; + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED) + { + RelFileLocator rlocator; + Buffer vmbuffer = InvalidBuffer; + BlockNumber block; + Relation reln; + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block); + reln = CreateFakeRelcacheEntry(rlocator); + + visibilitymap_pin(reln, block, &vmbuffer); + visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN); + + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) >= offnum) + lp = PageGetItemId(page, offnum); + + if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) + elog(PANIC, "neon_rm_redo: invalid lp"); + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + htup->t_infomask2 &= ~HEAP_KEYS_UPDATED; + fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask, + &htup->t_infomask2); + + /* + * Clear relevant update flags, but only if the modified infomask says + * there's no update. 
+ */ + if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask)) + { + HeapTupleHeaderClearHotUpdated(htup); + /* Make sure there is no forward chain link in t_ctid */ + ItemPointerSet(&htup->t_ctid, + BufferGetBlockNumber(buffer), + offnum); + } + HeapTupleHeaderSetXmax(htup, xlrec->xmax); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +static void +redo_neon_heap_multi_insert(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_neon_heap_multi_insert *xlrec; + RelFileLocator rlocator; + BlockNumber blkno; + Buffer buffer; + Page page; + union + { + HeapTupleHeaderData hdr; + char data[MaxHeapTupleSize]; + } tbuf; + HeapTupleHeader htup; + uint32 newlen; + Size freespace = 0; + int i; + bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0; + XLogRedoAction action; + + /* + * Insertion doesn't overwrite MVCC data, so no conflict processing is + * required. + */ + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); + + /* check that the mutually exclusive flags are not both set */ + Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) && + (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET))); + + /* + * The visibility map may need to be fixed even if the heap page is + * already up-to-date. 
+ */ + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + { + Relation reln = CreateFakeRelcacheEntry(rlocator); + Buffer vmbuffer = InvalidBuffer; + + visibilitymap_pin(reln, blkno, &vmbuffer); + visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); + ReleaseBuffer(vmbuffer); + FreeFakeRelcacheEntry(reln); + } + + if (isinit) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + PageInit(page, BufferGetPageSize(buffer), 0); + action = BLK_NEEDS_REDO; + } + else + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + char *tupdata; + char *endptr; + Size len; + + /* Tuples are stored as block data */ + tupdata = XLogRecGetBlockData(record, 0, &len); + endptr = tupdata + len; + + page = (Page) BufferGetPage(buffer); + + for (i = 0; i < xlrec->ntuples; i++) + { + OffsetNumber offnum; + xl_neon_multi_insert_tuple *xlhdr; + + /* + * If we're reinitializing the page, the tuples are stored in + * order from FirstOffsetNumber. Otherwise there's an array of + * offsets in the WAL record, and the tuples come after that. 
+ */ + if (isinit) + offnum = FirstOffsetNumber + i; + else + offnum = xlrec->offsets[i]; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "neon_rm_redo: invalid max offset number"); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(tupdata); + tupdata = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + + newlen = xlhdr->datalen; + Assert(newlen <= MaxHeapTupleSize); + htup = &tbuf.hdr; + MemSet((char *) htup, 0, SizeofHeapTupleHeader); + /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */ + memcpy((char *) htup + SizeofHeapTupleHeader, + (char *) tupdata, + newlen); + tupdata += newlen; + + newlen += SizeofHeapTupleHeader; + htup->t_infomask2 = xlhdr->t_infomask2; + htup->t_infomask = xlhdr->t_infomask; + htup->t_hoff = xlhdr->t_hoff; + HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); + HeapTupleHeaderSetCmin(htup, xlrec->t_cid); + ItemPointerSetBlockNumber(&htup->t_ctid, blkno); + ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); + + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "neon_rm_redo: failed to add tuple"); + } + if (tupdata != endptr) + elog(PANIC, "neon_rm_redo: total tuple length mismatch"); + + freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */ + + PageSetLSN(page, lsn); + + if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) + PageClearAllVisible(page); + + /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */ + if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET) + PageSetAllVisible(page); + + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* + * If the page is running low on free space, update the FSM as well. + * Arbitrarily, our definition of "low" is less than 20%. We can't do much + * better than that without knowing the fill-factor for the table. + * + * XXX: Don't do this if the page was restored from full page image. 
We + * don't bother to update the FSM in that case, it doesn't need to be + * totally accurate anyway. + */ + if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); +} + +#else +/* safeguard for older PostgreSQL versions */ +PG_MODULE_MAGIC; +#endif diff --git a/pgxn/neon_rmgr/neon_rmgr.control b/pgxn/neon_rmgr/neon_rmgr.control new file mode 100644 index 0000000000..d2bbb1b323 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr.control @@ -0,0 +1,4 @@ +# neon_rmgr extension +comment = 'Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)' +default_version = '1.0' +module_pathname = '$libdir/neon_rmgr' diff --git a/pgxn/neon_rmgr/neon_rmgr.h b/pgxn/neon_rmgr/neon_rmgr.h new file mode 100644 index 0000000000..2c26a928ad --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr.h @@ -0,0 +1,13 @@ +#ifndef NEON_RMGR_H +#define NEON_RMGR_H +#if PG_MAJORVERSION_NUM >= 16 +#include "access/xlog_internal.h" +#include "replication/decode.h" +#include "replication/logical.h" + +extern void neon_rm_desc(StringInfo buf, XLogReaderState *record); +extern void neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +extern const char *neon_rm_identify(uint8 info); + +#endif +#endif //NEON_RMGR_H diff --git a/pgxn/neon_rmgr/neon_rmgr_decode.c b/pgxn/neon_rmgr/neon_rmgr_decode.c new file mode 100644 index 0000000000..f327e132e9 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr_decode.c @@ -0,0 +1,404 @@ +#include "postgres.h" + +#if PG_MAJORVERSION_NUM >= 16 +#include "access/heapam_xlog.h" +#include "access/neon_xlog.h" +#include "replication/decode.h" +#include "replication/logical.h" +#include "replication/snapbuild.h" + +#include "neon_rmgr.h" + +/* individual record(group)'s handlers */ +static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void 
DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); +static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf); + +/* common function to decode tuples */ +static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple); + + +void +neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + uint8 info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK; + TransactionId xid = XLogRecGetXid(buf->record); + SnapBuild *builder = ctx->snapshot_builder; + + ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); + + /* + * If we don't have snapshot or we are just fast-forwarding, there is no + * point in decoding data changes. + */ + if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT || + ctx->fast_forward) + return; + + switch (info) + { + case XLOG_NEON_HEAP_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonInsert(ctx, buf); + break; + case XLOG_NEON_HEAP_DELETE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonDelete(ctx, buf); + break; + case XLOG_NEON_HEAP_UPDATE: + case XLOG_NEON_HEAP_HOT_UPDATE: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonUpdate(ctx, buf); + break; + case XLOG_NEON_HEAP_LOCK: + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + if (SnapBuildProcessChange(builder, xid, buf->origptr)) + DecodeNeonMultiInsert(ctx, buf); + break; + default: + elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info); + break; + } +} + +static inline bool +FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id) +{ + if (ctx->callbacks.filter_by_origin_cb == NULL) + return false; + + return filter_by_origin_cb_wrapper(ctx, origin_id); +} + +/* + * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. + * + * Deletes can contain the new tuple. 
+ */ +static void +DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + Size datalen; + char *tupledata; + Size tuplelen; + XLogReaderState *r = buf->record; + xl_neon_heap_insert *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples (this does happen when + * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) + change->action = REORDER_BUFFER_CHANGE_INSERT; + else + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + tupledata = XLogRecGetBlockData(r, 0, &datalen); + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); +} + +/* + * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs. + * + * Deletes can possibly contain the old primary key. 
+ */ +static void +DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_delete *xlrec; + ReorderBufferChange *change; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_delete *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + + if (xlrec->flags & XLH_DELETE_IS_SUPER) + change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT; + else + change->action = REORDER_BUFFER_CHANGE_DELETE; + + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + /* old primary key stored */ + if (xlrec->flags & XLH_DELETE_CONTAINS_OLD) + { + Size datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader; + Size tuplelen = datalen - SizeOfNeonHeapHeader; + + Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader)); + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete, + datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout + * in the record, from wal into proper tuplebufs. + * + * Updates can possibly contain a new tuple and the old primary key. 
+ */ +static void +DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_update *xlrec; + ReorderBufferChange *change; + char *data; + RelFileLocator target_locator; + + xlrec = (xl_neon_heap_update *) XLogRecGetData(r); + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL); + if (target_locator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_UPDATE; + change->origin_id = XLogRecGetOrigin(r); + memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator)); + + if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) + { + Size datalen; + Size tuplelen; + + data = XLogRecGetBlockData(r, 0, &datalen); + + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.newtuple); + } + + if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) + { + Size datalen; + Size tuplelen; + + /* caution, remaining data in record is not aligned */ + data = XLogRecGetData(r) + SizeOfNeonHeapUpdate; + datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate; + tuplelen = datalen - SizeOfNeonHeapHeader; + + change->data.tp.oldtuple = + ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); + + DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); + } + + change->data.tp.clear_toast_afterwards = true; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); +} + +/* + * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs. + * + * Currently MULTI_INSERT will always contain the full tuples. 
+ */ +static void +DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) +{ + XLogReaderState *r = buf->record; + xl_neon_heap_multi_insert *xlrec; + int i; + char *data; + char *tupledata; + Size tuplelen; + RelFileLocator rlocator; + + xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r); + + /* + * Ignore insert records without new tuples. This happens when a + * multi_insert is done on a catalog or on a non-persistent relation. + */ + if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) + return; + + /* only interested in our database */ + XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL); + if (rlocator.dbOid != ctx->slot->data.database) + return; + + /* output plugin doesn't look for this origin, no need to queue */ + if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) + return; + + /* + * We know that this multi_insert isn't for a catalog, so the block should + * always have data even if a full-page write of it is taken. + */ + tupledata = XLogRecGetBlockData(r, 0, &tuplelen); + Assert(tupledata != NULL); + + data = tupledata; + for (i = 0; i < xlrec->ntuples; i++) + { + ReorderBufferChange *change; + xl_neon_multi_insert_tuple *xlhdr; + int datalen; + ReorderBufferTupleBuf *tuple; + HeapTupleHeader header; + + change = ReorderBufferGetChange(ctx->reorder); + change->action = REORDER_BUFFER_CHANGE_INSERT; + change->origin_id = XLogRecGetOrigin(r); + + memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator)); + + xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data); + data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple; + datalen = xlhdr->datalen; + + change->data.tp.newtuple = + ReorderBufferGetTupleBuf(ctx->reorder, datalen); + + tuple = change->data.tp.newtuple; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* + * We can only figure this out after reassembling the transactions. 
+ */ + tuple->tuple.t_tableOid = InvalidOid; + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader, + (char *) data, + datalen); + header->t_infomask = xlhdr->t_infomask; + header->t_infomask2 = xlhdr->t_infomask2; + header->t_hoff = xlhdr->t_hoff; + + /* + * Reset toast reassembly state only after the last row in the last + * xl_multi_insert_tuple record emitted by one heap_multi_insert() + * call. + */ + if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && + (i + 1) == xlrec->ntuples) + change->data.tp.clear_toast_afterwards = true; + else + change->data.tp.clear_toast_afterwards = false; + + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), + buf->origptr, change, false); + + /* move to the next xl_neon_multi_insert_tuple entry */ + data += datalen; + } + Assert(data == tupledata + tuplelen); +} + +/* + * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete + * (but not by heap_multi_insert) into a tuplebuf. + * + * The size 'len' and the pointer 'data' in the record need to be + * computed outside as they are record specific. 
+ */ +static void +DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple) +{ + xl_neon_heap_header xlhdr; + int datalen = len - SizeOfNeonHeapHeader; + HeapTupleHeader header; + + Assert(datalen >= 0); + + tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; + header = tuple->tuple.t_data; + + /* not a disk based tuple */ + ItemPointerSetInvalid(&tuple->tuple.t_self); + + /* we can only figure this out after reassembling the transactions */ + tuple->tuple.t_tableOid = InvalidOid; + + /* data is not stored aligned, copy to aligned storage */ + memcpy((char *) &xlhdr, + data, + SizeOfNeonHeapHeader); + + memset(header, 0, SizeofHeapTupleHeader); + + memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader, + data + SizeOfNeonHeapHeader, + datalen); + + header->t_infomask = xlhdr.t_infomask; + header->t_infomask2 = xlhdr.t_infomask2; + header->t_hoff = xlhdr.t_hoff; +} + + +#endif \ No newline at end of file diff --git a/pgxn/neon_rmgr/neon_rmgr_desc.c b/pgxn/neon_rmgr/neon_rmgr_desc.c new file mode 100644 index 0000000000..8901c85ba2 --- /dev/null +++ b/pgxn/neon_rmgr/neon_rmgr_desc.c @@ -0,0 +1,181 @@ +#include "postgres.h" +#if PG_MAJORVERSION_NUM >= 16 +#include "access/heapam_xlog.h" +#include "access/neon_xlog.h" +#include "access/rmgr.h" +#include "access/rmgrdesc_utils.h" +#include "access/xlog_internal.h" +#include "miscadmin.h" +#include "storage/buf.h" +#include "storage/bufpage.h" + +#include "neon_rmgr.h" + +/* + * NOTE: "keyname" argument cannot have trailing spaces or punctuation + * characters + */ +static void +infobits_desc(StringInfo buf, uint8 infobits, const char *keyname) +{ + appendStringInfo(buf, "%s: [", keyname); + + Assert(buf->data[buf->len - 1] != ' '); + + if (infobits & XLHL_XMAX_IS_MULTI) + appendStringInfoString(buf, "IS_MULTI, "); + if (infobits & XLHL_XMAX_LOCK_ONLY) + appendStringInfoString(buf, "LOCK_ONLY, "); + if (infobits & XLHL_XMAX_EXCL_LOCK) + appendStringInfoString(buf, "EXCL_LOCK, "); + if (infobits & 
XLHL_XMAX_KEYSHR_LOCK) + appendStringInfoString(buf, "KEYSHR_LOCK, "); + if (infobits & XLHL_KEYS_UPDATED) + appendStringInfoString(buf, "KEYS_UPDATED, "); + + if (buf->data[buf->len - 1] == ' ') + { + /* Truncate-away final unneeded ", " */ + Assert(buf->data[buf->len - 2] == ','); + buf->len -= 2; + buf->data[buf->len] = '\0'; + } + + appendStringInfoString(buf, "]"); +} + +static void +truncate_flags_desc(StringInfo buf, uint8 flags) +{ + appendStringInfoString(buf, "flags: ["); + + if (flags & XLH_TRUNCATE_CASCADE) + appendStringInfoString(buf, "CASCADE, "); + if (flags & XLH_TRUNCATE_RESTART_SEQS) + appendStringInfoString(buf, "RESTART_SEQS, "); + + if (buf->data[buf->len - 1] == ' ') + { + /* Truncate-away final unneeded ", " */ + Assert(buf->data[buf->len - 2] == ','); + buf->len -= 2; + buf->data[buf->len] = '\0'; + } + + appendStringInfoString(buf, "]"); +} + +void +neon_rm_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + info &= XLOG_NEON_OPMASK; + + if (info == XLOG_NEON_HEAP_INSERT) + { + xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) rec; + + appendStringInfo(buf, "off: %u, flags: 0x%02X", + xlrec->offnum, + xlrec->flags); + } + else if (info == XLOG_NEON_HEAP_DELETE) + { + xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) rec; + + appendStringInfo(buf, "xmax: %u, off: %u, ", + xlrec->xmax, xlrec->offnum); + infobits_desc(buf, xlrec->infobits_set, "infobits"); + appendStringInfo(buf, ", flags: 0x%02X", xlrec->flags); + } + else if (info == XLOG_NEON_HEAP_UPDATE) + { + xl_neon_heap_update *xlrec = (xl_neon_heap_update *) rec; + + appendStringInfo(buf, "old_xmax: %u, old_off: %u, ", + xlrec->old_xmax, xlrec->old_offnum); + infobits_desc(buf, xlrec->old_infobits_set, "old_infobits"); + appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u", + xlrec->flags, xlrec->new_xmax, xlrec->new_offnum); + } + else if (info == 
XLOG_NEON_HEAP_HOT_UPDATE) + { + xl_neon_heap_update *xlrec = (xl_neon_heap_update *) rec; + + appendStringInfo(buf, "old_xmax: %u, old_off: %u, ", + xlrec->old_xmax, xlrec->old_offnum); + infobits_desc(buf, xlrec->old_infobits_set, "old_infobits"); + appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u", + xlrec->flags, xlrec->new_xmax, xlrec->new_offnum); + } + else if (info == XLOG_NEON_HEAP_LOCK) + { + xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) rec; + + appendStringInfo(buf, "xmax: %u, off: %u, ", + xlrec->xmax, xlrec->offnum); + infobits_desc(buf, xlrec->infobits_set, "infobits"); + appendStringInfo(buf, ", flags: 0x%02X", xlrec->flags); + } + else if (info == XLOG_NEON_HEAP_MULTI_INSERT) + { + xl_neon_heap_multi_insert *xlrec = (xl_neon_heap_multi_insert *) rec; + bool isinit = (XLogRecGetInfo(record) & XLOG_NEON_INIT_PAGE) != 0; + + appendStringInfo(buf, "ntuples: %d, flags: 0x%02X", xlrec->ntuples, + xlrec->flags); + + if (XLogRecHasBlockData(record, 0) && !isinit) + { + appendStringInfoString(buf, ", offsets:"); + array_desc(buf, xlrec->offsets, sizeof(OffsetNumber), + xlrec->ntuples, &offset_elem_desc, NULL); + } + } +} + +const char * +neon_rm_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_NEON_HEAP_INSERT: + id = "INSERT"; + break; + case XLOG_NEON_HEAP_INSERT | XLOG_NEON_INIT_PAGE: + id = "INSERT+INIT"; + break; + case XLOG_NEON_HEAP_DELETE: + id = "DELETE"; + break; + case XLOG_NEON_HEAP_UPDATE: + id = "UPDATE"; + break; + case XLOG_NEON_HEAP_UPDATE | XLOG_NEON_INIT_PAGE: + id = "UPDATE+INIT"; + break; + case XLOG_NEON_HEAP_HOT_UPDATE: + id = "HOT_UPDATE"; + break; + case XLOG_NEON_HEAP_HOT_UPDATE | XLOG_HEAP_INIT_PAGE: + id = "HOT_UPDATE+INIT"; + break; + case XLOG_NEON_HEAP_LOCK: + id = "LOCK"; + break; + case XLOG_NEON_HEAP_MULTI_INSERT: + id = "MULTI_INSERT"; + break; + case XLOG_NEON_HEAP_MULTI_INSERT | XLOG_NEON_INIT_PAGE: + id = "MULTI_INSERT+INIT"; + break; + } + + return 
id; +} + +#endif diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index e0cea4177b..aa644efd40 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -10,6 +10,8 @@ */ #include "postgres.h" +#include "../neon/neon_pgversioncompat.h" + #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" @@ -39,8 +41,13 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); * Linkage to functions in neon module. * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 +typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer); +#else +typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, void *buffer); +#endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -115,7 +122,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) uint32 buf_state; Buffer bufferid; bool isvalid; - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blocknum; @@ -128,7 +135,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) else isvalid = false; bufferid = BufferDescriptorGetBuffer(bufHdr); - rnode = bufHdr->tag.rnode; + rinfo = BufTagGetNRelFileInfo(bufHdr->tag); forknum = bufHdr->tag.forkNum; blocknum = bufHdr->tag.blockNum; @@ -141,7 +148,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS) */ if (isvalid) { - if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + if (ReadRecentBuffer(rinfo, forknum, blocknum, bufferid)) ReleaseBuffer(bufferid); } } @@ -238,7 +245,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + 
neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); relation_close(rel, AccessShareLock); @@ -267,10 +274,17 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) PG_RETURN_NULL(); { - RelFileNode rnode = { + NRelFileInfo rinfo = { +#if PG_MAJORVERSION_NUM < 16 .spcNode = PG_GETARG_OID(0), .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2)}; + .relNode = PG_GETARG_OID(2) +#else + .spcOid = PG_GETARG_OID(0), + .dbOid = PG_GETARG_OID(1), + .relNumber = PG_GETARG_OID(2) +#endif + }; ForkNumber forknum = PG_GETARG_UINT32(3); @@ -284,7 +298,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c index 2219543628..4e604a710c 100644 --- a/pgxn/neon_walredo/inmem_smgr.c +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -18,10 +18,12 @@ */ #include "postgres.h" +#include "../neon/neon_pgversioncompat.h" + #include "access/xlog.h" #include "storage/block.h" #include "storage/buf_internals.h" -#include "storage/relfilenode.h" +#include RELFILEINFO_HDR #include "storage/smgr.h" #if PG_VERSION_NUM >= 150000 @@ -43,10 +45,12 @@ static int used_pages; static int locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) { + NRelFileInfo rinfo = InfoFromSMgrRel(reln); + /* We only hold a small number of pages, so linear search */ for (int i = 0; i < used_pages; i++) { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i])) && forknum == page_tag[i].forkNum && blkno == page_tag[i].blockNum) { @@ -63,15 +67,26 @@ static void inmem_open(SMgrRelation reln); static void inmem_close(SMgrRelation reln, ForkNumber forknum); static 
void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); -static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -static void inmem_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); +static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo); static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +#if PG_MAJORVERSION_NUM < 16 +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); static void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +#else +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); +static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer); +static void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +#endif static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); @@ -95,9 +110,11 @@ inmem_init(void) static bool inmem_exists(SMgrRelation reln, ForkNumber forknum) { + NRelFileInfo rinfo = InfoFromSMgrRel(reln); + for (int i = 0; i < used_pages; i++) { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i])) && forknum == page_tag[i].forkNum) { return true; @@ -120,7 +137,7 @@ inmem_create(SMgrRelation reln, ForkNumber 
forknum, bool isRedo) * inmem_unlink() -- Unlink a relation. */ static void -inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo) { } @@ -135,12 +152,28 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) */ static void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) +#else + const void *buffer, bool skipFsync) +#endif { /* same as smgwrite() for us */ inmem_write(reln, forknum, blkno, buffer, skipFsync); } +#if PG_MAJORVERSION_NUM >= 16 +static void +inmem_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + char buffer[BLCKSZ] = {0}; + + for (int i = 0; i < nblocks; i++) + inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync); +} +#endif + /* * inmem_open() -- Initialize newly-opened relation. */ @@ -180,7 +213,11 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum, */ static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +#if PG_MAJORVERSION_NUM < 16 char *buffer) +#else + void *buffer) +#endif { int pg; @@ -200,7 +237,11 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ static void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +#if PG_MAJORVERSION_NUM < 16 char *buffer, bool skipFsync) +#else + const void *buffer, bool skipFsync) +#endif { int pg; @@ -216,9 +257,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ elog(used_pages >= WARN_PAGES ? 
WARNING : DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, used_pages); @@ -227,14 +266,13 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = used_pages; used_pages++; - INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + + InitBufferTag(&page_tag[pg], &InfoFromSMgrRel(reln), forknum, blocknum); } else { elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, used_pages); @@ -287,6 +325,9 @@ static const struct f_smgr inmem_smgr = .smgr_exists = inmem_exists, .smgr_unlink = inmem_unlink, .smgr_extend = inmem_extend, +#if PG_MAJORVERSION_NUM >= 16 + .smgr_zeroextend = inmem_zeroextend, +#endif .smgr_prefetch = inmem_prefetch, .smgr_read = inmem_read, .smgr_write = inmem_write, @@ -297,11 +338,11 @@ static const struct f_smgr inmem_smgr = }; const f_smgr * -smgr_inmem(BackendId backend, RelFileNode rnode) +smgr_inmem(BackendId backend, NRelFileInfo rinfo) { Assert(InRecovery); if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); + return smgr_standard(backend, rinfo); else return &inmem_smgr; } diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h index af7c3fe6cc..58b98b8e6a 100644 --- a/pgxn/neon_walredo/inmem_smgr.h +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -11,7 +11,7 @@ #ifndef INMEM_SMGR_H #define INMEM_SMGR_H -extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo); extern void smgr_init_inmem(void); #endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 9cce9b2a67..01e12983a6 
100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -43,6 +43,8 @@ #include "postgres.h" +#include "../neon/neon_pgversioncompat.h" + #include #include #include @@ -61,9 +63,11 @@ #include #endif +#if PG_MAJORVERSION_NUM < 16 #ifndef HAVE_GETRUSAGE #include "rusagestub.h" #endif +#endif #include "access/clog.h" #include "access/commit_ts.h" @@ -187,7 +191,7 @@ enter_seccomp_mode(void) * backend processes. Some initialization was done in CallExtMain * already. */ -void +PGDLLEXPORT void WalRedoMain(int argc, char *argv[]) { int firstchar; @@ -200,7 +204,7 @@ WalRedoMain(int argc, char *argv[]) /* * WAL redo does not need a large number of buffers. And speed of - * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * DropRelationAllLocalBuffers() is proportional to the number of * buffers. So let's keep it small (default value is 1024) */ num_temp_buffers = 4; @@ -212,6 +216,12 @@ WalRedoMain(int argc, char *argv[]) smgr_hook = smgr_inmem; smgr_init_hook = smgr_init_inmem; +#if PG_VERSION_NUM >= 160000 + /* make rmgr registry believe we can register the resource manager */ + process_shared_preload_libraries_in_progress = true; + load_file("$libdir/neon_rmgr", false); + process_shared_preload_libraries_in_progress = false; +#endif /* Initialize MaxBackends (if under postmaster, was done already) */ MaxConnections = 1; @@ -300,6 +310,9 @@ WalRedoMain(int argc, char *argv[]) */ MemoryContextSwitchTo(MessageContext); initStringInfo(&input_message); +#if PG_MAJORVERSION_NUM >= 16 + MyBackendType = B_BACKEND; +#endif for (;;) { @@ -534,16 +547,16 @@ CreateFakeSharedMemoryAndSemaphores() /* Version compatility wrapper for ReadBufferWithoutRelcache */ static inline Buffer -NeonRedoReadBuffer(RelFileNode rnode, +NeonRedoReadBuffer(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode) { #if PG_VERSION_NUM >= 150000 - return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + return 
ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode, NULL, /* no strategy */ true); /* WAL redo is only performed on permanent rels */ #else - return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode, NULL); /* no strategy */ #endif } @@ -647,7 +660,7 @@ ReadRedoCommand(StringInfo inBuf) static void BeginRedoForBlock(StringInfo input_message) { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blknum; SMgrRelation reln; @@ -662,22 +675,26 @@ BeginRedoForBlock(StringInfo input_message) * BlockNumber */ forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); +#if PG_MAJORVERSION_NUM < 16 + rinfo.spcNode = pq_getmsgint(input_message, 4); + rinfo.dbNode = pq_getmsgint(input_message, 4); + rinfo.relNode = pq_getmsgint(input_message, 4); +#else + rinfo.spcOid = pq_getmsgint(input_message, 4); + rinfo.dbOid = pq_getmsgint(input_message, 4); + rinfo.relNumber = pq_getmsgint(input_message, 4); +#endif blknum = pq_getmsgint(input_message, 4); wal_redo_buffer = InvalidBuffer; - INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + InitBufferTag(&target_redo_tag, &rinfo, forknum, blknum); elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", - target_redo_tag.rnode.spcNode, - target_redo_tag.rnode.dbNode, - target_redo_tag.rnode.relNode, + RelFileInfoFmt(rinfo), target_redo_tag.forkNum, target_redo_tag.blockNum); - reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); + reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { @@ -691,7 +708,7 @@ BeginRedoForBlock(StringInfo input_message) static void PushPage(StringInfo input_message) { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; 
BlockNumber blknum; const char *content; @@ -709,13 +726,19 @@ PushPage(StringInfo input_message) * 8k page content */ forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); +#if PG_MAJORVERSION_NUM < 16 + rinfo.spcNode = pq_getmsgint(input_message, 4); + rinfo.dbNode = pq_getmsgint(input_message, 4); + rinfo.relNode = pq_getmsgint(input_message, 4); +#else + rinfo.spcOid = pq_getmsgint(input_message, 4); + rinfo.dbOid = pq_getmsgint(input_message, 4); + rinfo.relNumber = pq_getmsgint(input_message, 4); +#endif blknum = pq_getmsgint(input_message, 4); content = pq_getmsgbytes(input_message, BLCKSZ); - buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK); + buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_ZERO_AND_LOCK); wal_redo_buffer = buf; page = BufferGetPage(buf); memcpy(page, content, BLCKSZ); @@ -831,7 +854,7 @@ ApplyRecord(StringInfo input_message) */ if (BufferIsInvalid(wal_redo_buffer)) { - wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode, + wal_redo_buffer = NeonRedoReadBuffer(BufTagGetNRelFileInfo(target_redo_tag), target_redo_tag.forkNum, target_redo_tag.blockNum, RBM_NORMAL); @@ -878,26 +901,29 @@ static bool redo_block_filter(XLogReaderState *record, uint8 block_id) { BufferTag target_tag; + NRelFileInfo rinfo; #if PG_VERSION_NUM >= 150000 XLogRecGetBlockTag(record, block_id, - &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum); + &rinfo, &target_tag.forkNum, &target_tag.blockNum); #else if (!XLogRecGetBlockTag(record, block_id, - &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) + &rinfo, &target_tag.forkNum, &target_tag.blockNum)) { /* Caller specified a bogus block_id */ elog(PANIC, "failed to locate backup block with ID %d", block_id); } #endif + CopyNRelFileInfoToBufTag(target_tag, rinfo); /* * Can a WAL redo function ever access a relation other than the one 
that * it modifies? I don't see why it would. + * Custom RMGRs may be affected by this. */ - if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + if (!RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(target_redo_tag))) elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", - target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + RelFileInfoFmt(rinfo), target_tag.forkNum, target_tag.blockNum); /* * If this block isn't one we are currently restoring, then return 'true' @@ -914,7 +940,7 @@ redo_block_filter(XLogReaderState *record, uint8 block_id) static void GetPage(StringInfo input_message) { - RelFileNode rnode; + NRelFileInfo rinfo; ForkNumber forknum; BlockNumber blknum; Buffer buf; @@ -931,14 +957,20 @@ GetPage(StringInfo input_message) * BlockNumber */ forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); +#if PG_MAJORVERSION_NUM < 16 + rinfo.spcNode = pq_getmsgint(input_message, 4); + rinfo.dbNode = pq_getmsgint(input_message, 4); + rinfo.relNode = pq_getmsgint(input_message, 4); +#else + rinfo.spcOid = pq_getmsgint(input_message, 4); + rinfo.dbOid = pq_getmsgint(input_message, 4); + rinfo.relNumber = pq_getmsgint(input_message, 4); +#endif blknum = pq_getmsgint(input_message, 4); /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ - buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL); + buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_NORMAL); Assert(buf == wal_redo_buffer); page = BufferGetPage(buf); /* single thread, so don't bother locking the page */ @@ -961,7 +993,7 @@ GetPage(StringInfo input_message) } while (tot_written < BLCKSZ); ReleaseBuffer(buf); - DropRelFileNodeAllLocalBuffers(rnode); + DropRelationAllLocalBuffers(rinfo); wal_redo_buffer = InvalidBuffer; elog(TRACE, "Page sent back for 
block %u", blknum); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index d728312de4..4ee66ddc8e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context, Result}; use bytes::Bytes; use futures::future::BoxFuture; use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; -use postgres_ffi::{XLogSegNo, PG_TLI}; +use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use std::cmp::{max, min}; use std::io::{self, SeekFrom}; @@ -138,19 +138,13 @@ impl PhysicalStorage { let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - match state.server.pg_version / 10000 { - 14 => postgres_ffi::v14::xlog_utils::find_end_of_wal( - &timeline_dir, - wal_seg_size, - state.commit_lsn, - )?, - 15 => postgres_ffi::v15::xlog_utils::find_end_of_wal( - &timeline_dir, - wal_seg_size, - state.commit_lsn, - )?, - _ => bail!("unsupported postgres version: {}", state.server.pg_version), - } + let version = state.server.pg_version / 10000; + + dispatch_pgversion!( + version, + pgv::xlog_utils::find_end_of_wal(&timeline_dir, wal_seg_size, state.commit_lsn,)?, + bail!("unsupported postgres version: {}", version) + ) }; // TODO: do we really know that write_lsn is fully flushed to disk? 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 272c32fdcd..7498d5ac05 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1583,6 +1583,7 @@ class NeonPageserver(PgProtocol): ".*took more than expected to complete.*", # these can happen during shutdown, but it should not be a reason to fail a test ".*completed, took longer than expected.*", + '.*registered custom resource manager "neon".*', ] def start( diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index b61f52be3c..657718da00 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -17,6 +17,7 @@ This fixture is used to determine which version of Postgres to use for tests. class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" + V16 = "16" # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" diff --git a/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json new file mode 100644 index 0000000000..1157e0d032 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v16/ext_index.json @@ -0,0 +1,7 @@ +{ + "public_extensions": [], + "library_index": { + "TODO": "We still need PG16 extensions" + }, + "extension_data": {} +} \ No newline at end of file diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 161662bc99..480caf6bf6 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -20,7 +20,7 @@ from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, 
skip_on_postgres from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser from fixtures.types import Lsn @@ -151,6 +151,7 @@ def test_create_snapshot( shutil.copytree(test_output_dir, compatibility_snapshot_dir) +@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release") @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -208,6 +209,7 @@ def test_backward_compatibility( ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" +@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release") @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 5f5c733b76..775ad10241 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -9,7 +9,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, skip_on_postgres from fixtures.remote_storage import ( RemoteStorageKind, S3Storage, @@ -86,6 +86,7 @@ def upload_files(env): # Test downloading remote extension. +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) @pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_extensions( @@ -148,6 +149,7 @@ def test_remote_extensions( # Test downloading remote library. 
+@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) @pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_library( @@ -206,6 +208,7 @@ def test_remote_library( # RemoteStorageKind.REAL_S3 not in available_s3_storages(), # reason="skipping test because real s3 not enabled", # ) +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_multiple_extensions_one_archive( neon_env_builder: NeonEnvBuilder, @@ -251,7 +254,8 @@ def test_extension_download_after_restart( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, ): - if "15" in pg_version: # SKIP v15 for now because test set only has extension built for v14 + # TODO: PG15 + PG16 extension building + if pg_version != PgVersion.V14: # test set only has extension built for v14 return None neon_env_builder.enable_extensions_remote_storage(RemoteStorageKind.MOCK_S3) diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 new file mode 160000 index 0000000000..7a50f139c6 --- /dev/null +++ b/vendor/postgres-v16 @@ -0,0 +1 @@ +Subproject commit 7a50f139c6269454ab9260c7a9752874b9089943 diff --git a/vendor/revisions.json b/vendor/revisions.json index 63b72cf506..48b03a4d5e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,5 @@ { + "postgres-v16": "7a50f139c6269454ab9260c7a9752874b9089943", "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027", "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568" }