diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index a9b5025562..879891e7a3 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -1,7 +1,8 @@ #!/bin/sh -# get instance id from meta-data service +# fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) # store fqdn hostname in var HOST=$(hostname -f) @@ -14,7 +15,8 @@ cat <> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + shell: bash -euxo pipefail {0} # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache @@ -93,8 +121,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -106,14 +134,17 @@ jobs: - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: mold -run make postgres -j$(nproc) + shell: bash -euxo pipefail {0} - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + shell: bash -euxo pipefail {0} - name: Run cargo test run: | ${cov_prefix} cargo test $CARGO_FLAGS + shell: bash -euxo pipefail {0} - name: Install rust binaries run: | @@ -154,9 +185,11 @@ jobs: echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done fi + shell: bash -euxo pipefail {0} - name: Install postgres binaries run: cp -a tmp_install /tmp/neon/pg_install + shell: bash -euxo pipefail {0} - name: Upload Neon artifact uses: ./.github/actions/upload @@ -171,7 +204,9 @@ jobs: pg_regress-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] strategy: fail-fast: false @@ -199,7 +234,9 @@ jobs: other-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] strategy: fail-fast: false @@ -230,7 +267,9 @@ jobs: benchmarks: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: @@ -261,7 +300,9 @@ jobs: coverage-report: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ other-tests, pg_regress-tests ] strategy: fail-fast: false @@ -284,7 +325,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -300,6 +341,7 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + shell: bash -euxo pipefail {0} - name: Build and upload coverage report run: | @@ -332,9 +374,13 @@ jobs: \"description\": \"Coverage report is ready\", \"target_url\": \"$REPORT_URL\" }" + shell: bash -euxo pipefail {0} trigger-e2e-tests: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] steps: - name: Set PR's status to pending and request a remote CI test @@ -369,150 +415,130 @@ jobs: } }" - docker-image: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ pg_regress-tests, other-tests ] - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} + neon-image: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v1 # v3 won't work with kaniko with: submodules: true fetch-depth: 0 - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - with: - driver: docker + - name: Kaniko build neon + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID - - name: Get build tag - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: build-tag + compute-tools-image: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug - - name: Get legacy build tag - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: legacy-build-tag - - - name: Build neon Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - pull: true - push: true - tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} - - docker-image-compute: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ pg_regress-tests, other-tests ] - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v1 # v3 won't work with kaniko + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute tools + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID + + compute-node-image: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko with: submodules: true fetch-depth: 0 - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - with: - driver: docker + - name: Kaniko build compute node + working-directory: ./vendor/postgres/ + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID - - name: Get build tag + promote-images: + runs-on: dev + needs: [ neon-image, compute-tools-image, compute-node-image ] + if: github.event_name != 'workflow_dispatch' + container: amazon/aws-cli + strategy: + fail-fast: false + matrix: + name: [ neon, compute-tools, compute-node ] + + steps: + - name: Promote image to latest + run: + MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + + push-docker-hub: + runs-on: dev + needs: [ promote-images, tag ] + container: golang:1.19-bullseye + + steps: + - name: Install Crane & ECR helper run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: build-tag + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + +# - name: Get build tag +# run: | +# if [[ "$GITHUB_REF_NAME" == "main" ]]; then +# echo "::set-output name=tag::$(git rev-list --count HEAD)" +# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then +# echo "::set-output name=tag::release-$(git rev-list --count HEAD)" +# else +# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " +# echo "::set-output name=tag::$GITHUB_RUN_ID" +# fi +# id: build-tag - - name: Get legacy build tag + - name: Configure ECR login run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: legacy-build-tag + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - name: Build compute-tools Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - push: false - file: Dockerfile.compute-tools - tags: neondatabase/compute-tools:local + - name: Pull neon image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon - - name: Push compute-tools Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - push: true - file: Dockerfile.compute-tools - tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} + - name: Pull compute tools image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools - - name: Build compute-node Docker image - uses: docker/build-push-action@v2 - with: - context: ./vendor/postgres/ - build-args: - COMPUTE_TOOLS_TAG=local - push: true - tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + - name: Pull compute node image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node + + - name: Configure docker login + run: | + # ECR Credential Helper & Docker Hub don't work together in config, hence reset + echo "" > /github/home/.docker/config.json + crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io + + - name: Push neon image to Docker Hub + run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} + + - name: Push compute tools image to Docker Hub + run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Push compute node image to Docker Hub + run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] @@ -538,14 +564,16 @@ jobs: deploy: runs-on: [ self-hosted, Linux, k8s-runner ] - # We need both storage **and** compute images for deploy, because control plane - # picks the compute version based on the storage version. If it notices a fresh - # storage it may bump the compute version. And if compute image failed to build - # it may break things badly. - needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets, tag ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} @@ -556,13 +584,19 @@ jobs: submodules: true fetch-depth: 0 + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Setup ansible run: | + export PATH="/root/.local/bin:$PATH" pip install --progress-bar off --user ansible boto3 - name: Redeploy run: | - export DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} cd "$(pwd)/.github/ansible" if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -585,13 +619,16 @@ jobs: rm -f neon_install.tar.gz .neon_current_version deploy-proxy: - runs-on: [ self-hosted, Linux, k8s-runner ] - # Compute image isn't strictly required for proxy deploy, but let's still wait for it - # to run all deploy jobs consistently. - needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets, tag ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} @@ -604,6 +641,9 @@ jobs: submodules: true fetch-depth: 0 + - name: Add curl + run: apt update && apt install curl -y + - name: Store kubeconfig file run: | echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} @@ -616,6 +656,6 @@ jobs: - name: Re-deploy proxy run: | - DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index aa37167a19..6f13a38dea 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -101,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh diff --git a/Dockerfile b/Dockerfile index 6f017ac5d4..17aa0025e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,6 @@ # Build Postgres -FROM neondatabase/rust:1.58 AS pg-build -WORKDIR /pg - -USER root +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build +WORKDIR /home/nonroot COPY vendor/postgres vendor/postgres COPY Makefile Makefile @@ -11,27 +9,30 @@ ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s postgres \ && rm -rf tmp_install/build \ - && tar -C tmp_install -czf /postgres_install.tar.gz . + && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . # Build zenith binaries -FROM neondatabase/rust:1.58 AS build +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build +WORKDIR /home/nonroot ARG GIT_VERSION=local # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build ARG RUSTC_WRAPPER=cachepot -ARG CACHEPOT_BUCKET=zenith-rust-cachepot -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY -COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +COPY --from=pg-build /home/nonroot/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ - && sudo -E "PATH=$PATH" mold -run cargo build --release \ + && mold -run cargo build --release \ && cachepot -s # Build final image @@ -40,8 +41,8 @@ FROM debian:bullseye-slim WORKDIR /data RUN set -e \ - && apt-get update \ - && apt-get install -y \ + && apt update \ + && apt install -y \ libreadline-dev \ libseccomp-dev \ openssl \ @@ -50,17 +51,14 @@ RUN set -e \ && useradd -d /data zenith \ && chown -R zenith:zenith /data -COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=pg-build /pg/tmp_install/ /usr/local/ -COPY --from=pg-build /postgres_install.tar.gz /data/ - -COPY docker-entrypoint.sh /docker-entrypoint.sh +COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ +COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ VOLUME ["/data"] USER zenith EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] CMD ["pageserver"] diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 76cbc2ac30..05393021c2 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,22 +1,25 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -FROM neondatabase/rust:1.58 AS rust-build +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build +WORKDIR /home/nonroot # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. # cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. ARG RUSTC_WRAPPER=cachepot -ARG CACHEPOT_BUCKET=zenith-rust-cachepot -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY COPY . . RUN set -e \ - && sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \ + && mold -run cargo build -p compute_tools --release \ && cachepot -s # Final image that only has one binary -FROM debian:buster-slim +FROM debian:bullseye-slim -COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e0b409f32d..75e552f6cc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use crate::safekeeper::SafekeeperNode; // This data structures represents neon_local CLI config // // It is deserialized from the .neon/config file, or the config file passed -// to 'zenith init --config=' option. See control_plane/simple.conf for +// to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // #[serde_as] @@ -320,7 +320,7 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'zenith init' first", + "Zenith config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } @@ -337,12 +337,12 @@ impl LocalEnv { } pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'zenith init --config=' + // Currently, the user first passes a config file with 'neon_local init --config=' // We read that in, in `create_config`, and fill any missing defaults. Then it's saved // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. let mut conf_content = r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line +# and safekeeeper node. It is read by the 'neon_local' command-line # utility. "# .to_string(); @@ -382,7 +382,7 @@ impl LocalEnv { } // - // Initialize a new Zenith repository + // Initialize a new Neon repository // pub fn init(&mut self) -> anyhow::Result<()> { // check if config already exists diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 0cae479d71..652736058a 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,5 +1,4 @@ use std::io::Write; -use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; use std::sync::Arc; @@ -52,7 +51,7 @@ impl ResponseErrorMessageExt for Response { Err(SafekeeperHttpError::Response( match self.json::() { Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {url}.", status.as_u16()), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }, )) } @@ -241,37 +240,23 @@ impl SafekeeperNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if safekeeper flushes a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nSafekeeper connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port - match self.check_status() { - Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nSafekeeper status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; + if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2742e84bb..aab29628e3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,9 +1,8 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; -use std::net::TcpStream; use std::num::NonZeroU64; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::process::Command; use std::time::Duration; use std::{io, result, thread}; @@ -103,23 +102,19 @@ impl PageServerNode { /// Construct libpq connection string for connecting to the pageserver. fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config { - format!("postgresql://no_user:{}@{}/no_db", password, listen_addr) + format!("postgresql://no_user:{password}@{listen_addr}/no_db") .parse() .unwrap() } - pub fn init( + pub fn initialize( &self, create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], ) -> anyhow::Result { - let mut cmd = Command::new(self.env.pageserver_bin()?); - let id = format!("id={}", self.env.pageserver.id); - // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. - let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); @@ -139,67 +134,52 @@ impl PageServerNode { .collect::>() .join(",") ); - let mut args = Vec::with_capacity(20); - - args.push("--init"); - args.extend(["-D", &base_data_dir_param]); - args.extend(["-c", &pg_distrib_dir_param]); - args.extend(["-c", &authg_type_param]); - args.extend(["-c", &listen_http_addr_param]); - args.extend(["-c", &listen_pg_addr_param]); - args.extend(["-c", &broker_endpoints_param]); - args.extend(["-c", &id]); - let broker_etcd_prefix_param = self .env .etcd_broker .broker_etcd_prefix .as_ref() .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); - if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { - args.extend(["-c", broker_etcd_prefix_param]); - } - for config_override in config_overrides { - args.extend(["-c", config_override]); + let mut init_config_overrides = config_overrides.to_vec(); + init_config_overrides.push(&id); + init_config_overrides.push(&pg_distrib_dir_param); + init_config_overrides.push(&authg_type_param); + init_config_overrides.push(&listen_http_addr_param); + init_config_overrides.push(&listen_pg_addr_param); + init_config_overrides.push(&broker_endpoints_param); + + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + init_config_overrides.push(broker_etcd_prefix_param); } if self.env.pageserver.auth_type != AuthType::Trust { - args.extend([ - "-c", - "auth_validation_public_key_path='auth_public_key.pem'", - ]); + init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); } - let create_tenant = create_tenant.map(|id| id.to_string()); - if let Some(tenant_id) = create_tenant.as_deref() { - args.extend(["--create-tenant", tenant_id]) + self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; + let init_result = self + .try_init_timeline(create_tenant, initial_timeline_id) + .context("Failed to create initial tenant and timeline for pageserver"); + match &init_result { + Ok(initial_timeline_id) => { + println!("Successfully initialized timeline {initial_timeline_id}") + } + Err(e) => eprintln!("{e:#}"), } + self.stop(false)?; + init_result + } - let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - let initial_timeline_id_string = initial_timeline_id.to_string(); - args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - - let cmd_with_args = cmd.args(args); - let init_output = fill_rust_env_vars(cmd_with_args) - .output() - .with_context(|| { - format!("failed to init pageserver with command {:?}", cmd_with_args) - })?; - - if !init_output.status.success() { - bail!( - "init invocation failed, {}\nStdout: {}\nStderr: {}", - init_output.status, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr) - ); - } - - // echo the captured output of the init command - println!("{}", String::from_utf8_lossy(&init_output.stdout)); - - Ok(initial_timeline_id) + fn try_init_timeline( + &self, + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { + let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; + let initial_timeline_info = + self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + Ok(initial_timeline_info.timeline_id) } pub fn repo_path(&self) -> PathBuf { @@ -211,15 +191,35 @@ impl PageServerNode { } pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - print!( + self.start_node(config_overrides, &self.repo_path(), false) + } + + fn start_node( + &self, + config_overrides: &[&str], + datadir: &Path, + update_config: bool, + ) -> anyhow::Result<()> { + println!( "Starting pageserver at '{}' in '{}'", connection_address(&self.pg_connection_config), - self.repo_path().display() + datadir.display() ); - io::stdout().flush().unwrap(); + io::stdout().flush()?; - let repo_path = self.repo_path(); - let mut args = vec!["-D", repo_path.to_str().unwrap()]; + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!( + "Datadir path '{}' cannot be represented as a unicode string", + datadir.display() + ) + })?, + ]; + + if update_config { + args.push("--update-config"); + } for config_override in config_overrides { args.extend(["-c", config_override]); @@ -231,8 +231,8 @@ impl PageServerNode { if !filled_cmd.status()?.success() { bail!( - "Pageserver failed to start. See '{}' for details.", - self.repo_path().join("pageserver.log").display() + "Pageserver failed to start. See console output and '{}' for details.", + datadir.join("pageserver.log").display() ); } @@ -241,7 +241,7 @@ impl PageServerNode { const RETRIES: i8 = 15; for retries in 1..RETRIES { match self.check_status() { - Ok(_) => { + Ok(()) => { println!("\nPageserver started"); return Ok(()); } @@ -255,21 +255,18 @@ impl PageServerNode { if retries == 5 { println!() // put a line break after dots for second message } - println!( - "Pageserver not responding yet, err {} retrying ({})...", - err, retries - ); + println!("Pageserver not responding yet, err {err} retrying ({retries})..."); } } PageserverHttpError::Response(msg) => { - bail!("pageserver failed to start: {} ", msg) + bail!("pageserver failed to start: {msg} ") } } thread::sleep(Duration::from_secs(1)); } } } - bail!("pageserver failed to start in {} seconds", RETRIES); + bail!("pageserver failed to start in {RETRIES} seconds"); } /// @@ -299,51 +296,32 @@ impl PageServerNode { match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { - println!( - "Pageserver with pid {} does not exist, but a PID file was found", - pid - ); + println!("Pageserver with pid {pid} does not exist, but a PID file was found"); return Ok(()); } Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, + "Failed to send signal to pageserver with pid {pid}: {}", err.desc() ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if pageserver checkpoints a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nPageserver connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; - match self.check_status() { - Err(PageserverHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nPageserver status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); @@ -351,13 +329,13 @@ impl PageServerNode { thread::sleep(Duration::from_millis(100)); } - bail!("Failed to stop pageserver with pid {}", pid); + bail!("Failed to stop pageserver with pid {pid}"); } pub fn page_server_psql(&self, sql: &str) -> Vec { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); - println!("Pageserver query: '{}'", sql); + println!("Pageserver query: '{sql}'"); client.simple_query(sql).unwrap() } @@ -392,9 +370,8 @@ impl PageServerNode { &self, new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result> { - let tenant_id_string = self - .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + ) -> anyhow::Result { + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, checkpoint_distance: settings @@ -433,18 +410,16 @@ impl PageServerNode { }) .send()? .error_from_body()? - .json::>()?; - - tenant_id_string - .map(|id| { - id.parse().with_context(|| { - format!( - "Failed to parse tennat creation response as tenant id: {}", - id - ) + .json::>() + .with_context(|| { + format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") + })? + .context("No tenant id was found in the tenant creation response") + .and_then(|tenant_id_string| { + tenant_id_string.parse().with_context(|| { + format!("Failed to parse response string as tenant id: '{tenant_id_string}'") }) }) - .transpose() } pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { @@ -515,22 +490,27 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, - ) -> anyhow::Result> { - let timeline_info_response = self - .http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) -> anyhow::Result { + self.http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .json(&TimelineCreateRequest { + new_timeline_id, + ancestor_start_lsn, + ancestor_timeline_id, + }) + .send()? + .error_from_body()? + .json::>() + .with_context(|| { + format!("Failed to parse timeline creation response for tenant id: {tenant_id}") + })? + .with_context(|| { + format!( + "No timeline id was found in the timeline creation response for tenant {tenant_id}" ) - .json(&TimelineCreateRequest { - new_timeline_id, - ancestor_start_lsn, - ancestor_timeline_id, - }) - .send()? - .error_from_body()? - .json::>()?; - - Ok(timeline_info_response) + }) } /// Import a basebackup prepared using either: diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh deleted file mode 100755 index 75dbdaed7a..0000000000 --- a/docker-entrypoint.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -eux - -pageserver_id_param="${NODE_ID:-10}" - -broker_endpoints_param="${BROKER_ENDPOINT:-absent}" -if [ "$broker_endpoints_param" != "absent" ]; then - broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" -else - broker_endpoints_param='' -fi - -remote_storage_param="${REMOTE_STORAGE:-}" - -if [ "$1" = 'pageserver' ]; then - if [ ! -d "/data/tenants" ]; then - echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param - fi - echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data -else - "$@" -fi diff --git a/libs/postgres_ffi/pg_control_ffi.h b/libs/postgres_ffi/bindgen_deps.h similarity index 100% rename from libs/postgres_ffi/pg_control_ffi.h rename to libs/postgres_ffi/bindgen_deps.h diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 7db2c20e34..69b2711c22 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -44,7 +44,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn main() { // Tell cargo to invalidate the built crate whenever the wrapper changes - println!("cargo:rerun-if-changed=pg_control_ffi.h"); + println!("cargo:rerun-if-changed=bindgen_deps.h"); // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` @@ -88,9 +88,9 @@ fn main() { // the resulting bindings. let bindings = bindgen::Builder::default() // - // All the needed PostgreSQL headers are included from 'pg_control_ffi.h' + // All the needed PostgreSQL headers are included from 'bindgen_deps.h' // - .header("pg_control_ffi.h") + .header("bindgen_deps.h") // // Tell cargo to invalidate the built crate whenever any of the // included header files changed. diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 4df2342b90..0918d15001 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -23,7 +23,7 @@ //! information. You can use PostgreSQL's pg_controldata utility to view its //! contents. //! -use crate::{ControlFileData, PG_CONTROL_FILE_SIZE}; +use super::bindings::{ControlFileData, PG_CONTROL_FILE_SIZE}; use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 28d9a13dbf..022355329c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -7,21 +7,62 @@ // https://github.com/rust-lang/rust-bindgen/issues/1651 #![allow(deref_nullptr)] -use serde::{Deserialize, Serialize}; use utils::lsn::Lsn; -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +macro_rules! postgres_ffi { + ($version:ident) => { + #[path = "."] + pub mod $version { + // fixme: does this have to be 'pub'? + pub mod bindings { + // bindgen generates bindings for a lot of stuff we don't need + #![allow(dead_code)] -pub mod controlfile_utils; -pub mod nonrelfile_utils; -pub mod pg_constants; -pub mod relfile_utils; -pub mod waldecoder; -pub mod xlog_utils; + use serde::{Deserialize, Serialize}; + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + } + pub mod controlfile_utils; + pub mod nonrelfile_utils; + pub mod pg_constants; + pub mod relfile_utils; + pub mod waldecoder; + pub mod xlog_utils; + + // Re-export some symbols from bindings + pub use bindings::DBState_DB_SHUTDOWNED; + pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + } + }; +} + +postgres_ffi!(v14); + +// Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::{uint32, uint64, Oid}; +pub use v14::bindings::{BlockNumber, OffsetNumber}; +pub use v14::bindings::{MultiXactId, TransactionId}; + +// Likewise for these, although the assumption that these don't change is a little more iffy. +pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; + +// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and +// --with-segsize=SEGSIZE, but assume the defaults for now. +pub const BLCKSZ: u16 = 8192; +pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); +pub const XLOG_BLCKSZ: usize = 8192; + +// PG timeline is always 1, changing it doesn't have any useful meaning in Neon. +// +// NOTE: this is not to be confused with Neon timelines; different concept! +// +// It's a shaky assumption, that it's always 1. We might import a +// PostgreSQL data directory that has gone through timeline bumps, +// for example. FIXME later. +pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index b92207cd81..04ef346d88 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,11 +1,12 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use crate::{pg_constants, transaction_id_precedes}; +use crate::transaction_id_precedes; +use super::pg_constants; use bytes::BytesMut; use log::*; -use crate::MultiXactId; +use super::bindings::MultiXactId; pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { trace!( diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 7230b841f5..42b5c5d842 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -7,7 +7,8 @@ //! comments on them. //! -use crate::PageHeaderData; +use super::bindings::PageHeaderData; +use crate::BLCKSZ; // // From pg_tablespace_d.h @@ -31,11 +32,6 @@ pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; -// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and -// --with-segsize=SEGSIZE, but assume the defaults for now. -pub const BLCKSZ: u16 = 8192; -pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); - // // From bufpage.h // @@ -213,7 +209,6 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; /* FIXME: pageserver should request wal_seg_size from compute node */ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; -pub const XLOG_BLCKSZ: usize = 8192; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index 94498ee9a9..f3476acc9c 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,11 +1,11 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! -use crate::pg_constants; +use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; -#[derive(Debug, Clone, thiserror::Error, PartialEq)] +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] InvalidForkName, diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index cbb761236c..0e1c9567cb 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -10,10 +10,7 @@ //! use super::pg_constants; use super::xlog_utils::*; -use super::XLogLongPageHeaderData; -use super::XLogPageHeaderData; -use super::XLogRecord; -use super::XLOG_PAGE_MAGIC; +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 29b00c8d36..e7838c3f2c 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -7,22 +7,24 @@ // have been named the same as the corresponding PostgreSQL functions instead. // -use crate::pg_constants; -use crate::CheckPoint; -use crate::FullTransactionId; -use crate::XLogLongPageHeaderData; -use crate::XLogPageHeaderData; -use crate::XLogRecord; -use crate::XLOG_PAGE_MAGIC; +use crc32c::crc32c_append; -use crate::pg_constants::WAL_SEGMENT_SIZE; -use crate::waldecoder::WalStreamDecoder; +use super::bindings::{ + CheckPoint, FullTransactionId, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, + XLOG_PAGE_MAGIC, +}; +use super::pg_constants; +use super::pg_constants::WAL_SEGMENT_SIZE; +use crate::v14::waldecoder::WalStreamDecoder; +use crate::PG_TLI; +use crate::{uint32, uint64, Oid}; use bytes::BytesMut; use bytes::{Buf, Bytes}; use log::*; +use serde::Serialize; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; @@ -47,9 +49,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; -// PG timeline is always 1, changing it doesn't have useful meaning in Zenith. -pub const PG_TLI: u32 = 1; - pub type XLogRecPtr = u64; pub type TimeLineID = u32; pub type TimestampTz = i64; @@ -80,12 +79,12 @@ pub fn XLogSegNoOffsetToRecPtr( #[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { - return format!( + format!( "{:>08X}{:>08X}{:>08X}", tli, logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) - ); + ) } #[allow(non_snake_case)] @@ -346,6 +345,85 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result Bytes { + use utils::bin_ser::LeSer; + self.ser().unwrap().into() + } +} + +/// Create new WAL record for non-transactional logical message. +/// Used for creating artificial WAL for tests, as LogicalMessage +/// record is basically no-op. +/// +/// NOTE: This leaves the xl_prev field zero. The safekeeper and +/// pageserver tolerate that, but PostgreSQL does not. +pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { + let mut prefix_bytes: Vec = Vec::with_capacity(prefix.len() + 1); + prefix_bytes.write_all(prefix.as_bytes()).unwrap(); + prefix_bytes.push(0); + + let message_bytes = message.as_bytes(); + + let logical_message = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as u64, + message_size: message_bytes.len() as u64, + }; + + let mainrdata = logical_message.encode(); + let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); + // only short mainrdata is supported for now + assert!(mainrdata_len <= 255); + let mainrdata_len = mainrdata_len as u8; + + let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; + data.extend_from_slice(&mainrdata); + data.extend_from_slice(&prefix_bytes); + data.extend_from_slice(message_bytes); + + let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len(); + + let mut header = XLogRecord { + xl_tot_len: total_len as u32, + xl_xid: 0, + xl_prev: 0, + xl_info: 0, + xl_rmid: 21, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: 0, // crc will be calculated later + }; + + let header_bytes = header.encode().expect("failed to encode header"); + let crc = crc32c_append(0, &data); + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut wal: Vec = Vec::new(); + wal.extend_from_slice(&header.encode().expect("failed to encode header")); + wal.extend_from_slice(&data); + + // WAL start position must be aligned at 8 bytes, + // this will add padding for the next WAL record. + const PADDING: usize = 8; + let padding_rem = wal.len() % PADDING; + if padding_rem != 0 { + wal.resize(wal.len() + PADDING - padding_rem, 0); + } + + wal +} + #[cfg(test)] mod tests { use super::*; @@ -547,4 +625,15 @@ mod tests { checkpoint.update_next_xid(1024); assert_eq!(checkpoint.nextXid.value, 2048); } + + #[test] + pub fn test_encode_logical_message() { + let expected = [ + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, + 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, + 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + ]; + let actual = encode_logical_message("prefix", "message"); + assert_eq!(expected, actual[..]); + } } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index e3b666da41..6ac5afb27f 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,8 +4,8 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; -use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE; -use postgres_ffi::xlog_utils::{ +use postgres_ffi::v14::pg_constants::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::{ XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; use std::cmp::Ordering; diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 70f54ea02f..42b45eeea0 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -265,7 +265,7 @@ mod tests { use serde::{Deserialize, Serialize}; use std::io::Cursor; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, b: u32, @@ -286,7 +286,7 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, pub blockpos: u32, diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 8e3d357397..4984d695fd 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -10,12 +10,10 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => { - return Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name - ))) - } + None => Err(ApiError::BadRequest(format!( + "no {} specified in path param", + param_name + ))), } } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 3dab2a625c..1090f4c679 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -18,7 +18,7 @@ pub const XLOG_BLCKSZ: u32 = 8192; pub struct Lsn(pub u64); /// We tried to parse an LSN from a string, but failed -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("LsnParseError")] pub struct LsnParseError; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 79dca96fcf..4d873bd5ac 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -50,7 +50,7 @@ pub trait Handler { /// PostgresBackend protocol state. /// XXX: The order of the constructors matters. -#[derive(Clone, Copy, PartialEq, PartialOrd)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] pub enum ProtoState { Initialization, Encrypted, diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 3dcae4d0af..3f14acd50d 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -930,7 +930,7 @@ impl<'a> BeMessage<'a> { // Neon extension of postgres replication protocol // See NEON_STATUS_UPDATE_TAG_BYTE -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct ReplicationFeedback { // Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index bc32f51b13..a531975d60 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -9,7 +9,7 @@ use std::sync::Mutex; use std::time::Duration; /// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached diff --git a/libs/utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs index f357837a55..b995b61b78 100644 --- a/libs/utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -4,7 +4,7 @@ use serde::Deserialize; use std::io::Read; use utils::bin_ser::LeSer; -#[derive(Debug, PartialEq, Deserialize)] +#[derive(Debug, PartialEq, Eq, Deserialize)] pub struct HeaderData { magic: u16, info: u16, diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index 907ef98aec..248400c2c1 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -30,6 +30,9 @@ static CERT: Lazy = Lazy::new(|| { }); #[test] +// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274), +// we resize the vector so doing some modifications after all +#[allow(clippy::read_zero_byte_vec)] fn ssl() { let (mut client_sock, server_sock) = make_tcp_pair(); diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index c4dd52e183..78a465539a 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -501,10 +501,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { // default_tenantid was generated by the `env.init()` call above let initial_tenant_id = env.default_tenant_id.unwrap(); - // Call 'pageserver init'. + // Initialize pageserver, create initial tenant and timeline. let pageserver = PageServerNode::from_env(&env); let initial_timeline_id = pageserver - .init( + .initialize( Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), @@ -551,25 +551,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .values_of("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = pageserver - .tenant_create(initial_tenant_id, tenant_conf)? - .ok_or_else(|| { - anyhow!("Tenant with id {:?} was already created", initial_tenant_id) - })?; - println!( - "tenant {} successfully created on the pageserver", - new_tenant_id - ); + let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; + println!("tenant {new_tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline = pageserver - .timeline_create(new_tenant_id, new_timeline_id, None, None)? - .context(format!( - "Failed to create initial timeline for tenant {new_tenant_id}" - ))?; - let new_timeline_id = timeline.timeline_id; - let last_record_lsn = timeline + let timeline_info = + pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + let new_timeline_id = timeline_info.timeline_id; + let last_record_lsn = timeline_info .local .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))? .last_record_lsn; @@ -616,20 +606,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline = pageserver - .timeline_create(tenant_id, None, None, None)? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}", - timeline.timeline_id, last_record_lsn, tenant_id, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}", + timeline_info.timeline_id ); } Some(("import", import_match)) => { @@ -680,10 +668,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let ancestor_timeline_id = env .get_branch_timeline_id(ancestor_branch_name, tenant_id) .ok_or_else(|| { - anyhow!( - "Found no timeline id for branch name '{}'", - ancestor_branch_name - ) + anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; let start_lsn = branch_match @@ -691,12 +676,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(Lsn::from_str) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline = pageserver - .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + )?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; @@ -704,11 +692,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", - timeline.timeline_id, last_record_lsn, tenant_id, ancestor_branch_name, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. Ancestor timeline: '{ancestor_branch_name}'", + timeline_info.timeline_id ); } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), None => bail!("no tenant subcommand provided"), } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 5837447ce8..33f072553f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -24,8 +24,13 @@ use tracing::*; use crate::reltag::{RelTag, SlruKind}; use crate::DatadirTimeline; -use postgres_ffi::xlog_utils::*; -use postgres_ffi::*; + +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; +use postgres_ffi::v14::{CheckPoint, ControlFileData}; +use postgres_ffi::TransactionId; +use postgres_ffi::PG_TLI; +use postgres_ffi::{BLCKSZ, RELSEG_SIZE}; use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, @@ -200,7 +205,7 @@ where } // Add a file for each chunk of blocks (aka segment) - let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize); + let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize); for (seg, blocks) in chunks.into_iter().enumerate() { let mut segment_data: Vec = vec![]; for blknum in blocks { @@ -220,23 +225,19 @@ where fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; - let mut slru_buf: Vec = - Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); + let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; if slru == SlruKind::Clog { - ensure!( - img.len() == pg_constants::BLCKSZ as usize - || img.len() == pg_constants::BLCKSZ as usize + 8 - ); + ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); } else { - ensure!(img.len() == pg_constants::BLCKSZ as usize); + ensure!(img.len() == BLCKSZ as usize); } - slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]); + slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); } let segname = format!("{}/{:>04X}", slru.to_str(), segno); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b539964414..1a13147f42 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,6 @@ //! Main entry point for the Page Server executable. -use std::{env, path::Path, str::FromStr}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -13,7 +13,7 @@ use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, - timelines, virtual_file, LOG_FILE_NAME, + virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, @@ -24,7 +24,6 @@ use utils::{ shutdown::exit_now, signals::{self, Signal}, tcp_listener, - zid::{ZTenantId, ZTimelineId}, }; project_git_version!(GIT_VERSION); @@ -42,6 +41,7 @@ fn main() -> anyhow::Result<()> { .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( + Arg::new("daemonize") .short('d') .long("daemonize") @@ -52,7 +52,7 @@ fn main() -> anyhow::Result<()> { Arg::new("init") .long("init") .takes_value(false) - .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"), + .help("Initialize pageserver with all given config overrides"), ) .arg( Arg::new("workdir") @@ -61,20 +61,6 @@ fn main() -> anyhow::Result<()> { .takes_value(true) .help("Working directory for the pageserver"), ) - .arg( - Arg::new("create-tenant") - .long("create-tenant") - .takes_value(true) - .help("Create tenant during init") - .requires("init"), - ) - .arg( - Arg::new("initial-timeline-id") - .long("initial-timeline-id") - .takes_value(true) - .help("Use a specific timeline id during init and tenant creation") - .requires("create-tenant"), - ) // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") @@ -85,6 +71,9 @@ fn main() -> anyhow::Result<()> { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) + .arg(Arg::new("update-config").long("update-config").takes_value(false).help( + "Update the config file when started", + )) .arg( Arg::new("enabled-features") .long("enabled-features") @@ -110,18 +99,6 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; let cfg_file_path = workdir.join("pageserver.toml"); - let init = arg_matches.is_present("init"); - let create_tenant = arg_matches - .value_of("create-tenant") - .map(ZTenantId::from_str) - .transpose() - .context("Failed to parse tenant id from the arguments")?; - let initial_timeline_id = arg_matches - .value_of("initial-timeline-id") - .map(ZTimelineId::from_str) - .transpose() - .context("Failed to parse timeline id from the arguments")?; - // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { format!( @@ -131,30 +108,86 @@ fn main() -> anyhow::Result<()> { })?; let daemonize = arg_matches.is_present("daemonize"); - if init && daemonize { - bail!("--daemonize cannot be used with --init") - } - let mut toml = if init { - // We're initializing the repo, so there's no config file yet - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")? - } else { - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) - .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?; - cfg_file_contents - .parse::() - .with_context(|| { - format!( - "Failed to read '{}' as pageserver config", - cfg_file_path.display() - ) - })? + let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { + ControlFlow::Continue(conf) => conf, + ControlFlow::Break(()) => { + info!("Pageserver config init successful"); + return Ok(()); + } + }; + + let tenants_path = conf.tenants_path(); + if !tenants_path.exists() { + utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + format!( + "Failed to create tenants root dir at '{}'", + tenants_path.display() + ) + })?; + } + + // Initialize up failpoints support + let scenario = FailScenario::setup(); + + // Basic initialization of things that don't change after startup + virtual_file::init(conf.max_file_descriptors); + page_cache::init(conf.page_cache_size); + + start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + + scenario.teardown(); + Ok(()) +} + +fn initialize_config( + cfg_file_path: &Path, + arg_matches: clap::ArgMatches, + workdir: &Path, +) -> anyhow::Result> { + let init = arg_matches.is_present("init"); + let update_config = init || arg_matches.is_present("update-config"); + + let (mut toml, config_file_exists) = if cfg_file_path.is_file() { + if init { + anyhow::bail!( + "Config file '{}' already exists, cannot init it, use --update-config to update it", + cfg_file_path.display() + ); + } + // Supplement the CLI arguments with the config file + let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + format!( + "Failed to read pageserver config at '{}'", + cfg_file_path.display() + ) + })?; + ( + cfg_file_contents + .parse::() + .with_context(|| { + format!( + "Failed to parse '{}' as pageserver config", + cfg_file_path.display() + ) + })?, + true, + ) + } else if cfg_file_path.exists() { + anyhow::bail!( + "Config file '{}' exists but is not a regular file", + cfg_file_path.display() + ); + } else { + // We're initializing the repo, so there's no config file yet + ( + DEFAULT_CONFIG_FILE + .parse::() + .context("could not parse built-in config file")?, + false, + ) }; - // Process any extra options given with -c if let Some(values) = arg_matches.values_of("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -165,49 +198,38 @@ fn main() -> anyhow::Result<()> { })?; for (key, item) in doc.iter() { - if key == "id" { - anyhow::ensure!( - init, - "node id can only be set during pageserver init and cannot be overridden" - ); + if config_file_exists && update_config && key == "id" && toml.contains_key(key) { + anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display()); } toml.insert(key, item.clone()); } } } - trace!("Resulting toml: {}", toml); - let conf = PageServerConf::parse_and_validate(&toml, &workdir) + + debug!("Resulting toml: {toml}"); + let conf = PageServerConf::parse_and_validate(&toml, workdir) .context("Failed to parse pageserver configuration")?; - // The configuration is all set up now. Turn it into a 'static - // that can be freely stored in structs and passed across threads - // as a ref. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + if update_config { + info!("Writing pageserver config to '{}'", cfg_file_path.display()); - // Initialize up failpoints support - let scenario = FailScenario::setup(); - - // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); - page_cache::init(conf.page_cache_size); - - // Create repo and exit if init was requested - if init { - timelines::init_pageserver(conf, create_tenant, initial_timeline_id) - .context("Failed to init pageserver")?; - // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( - "Failed to initialize pageserver config at '{}'", + "Failed to write pageserver config to '{}'", cfg_file_path.display() ) })?; - } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + info!( + "Config successfully written to '{}'", + cfg_file_path.display() + ) } - scenario.teardown(); - Ok(()) + Ok(if init { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(Box::leak(Box::new(conf))) + }) } fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1b1b4f99cb..1d0adec63d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,14 +11,13 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::metadata::TimelineMetadata; +use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{LocalTimelineState, RepositoryTimeline}; use crate::repository::{Repository, Timeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; -use crate::TimelineImpl; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ auth::JwtAuth, @@ -86,7 +85,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &TimelineImpl, + timeline: &LayeredTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -161,13 +160,13 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo } fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, + repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline( - &*timeline, + timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, ), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 7d1e8e43aa..729829c5e8 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -15,13 +15,24 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::*; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::v14::relfile_utils::*; +use postgres_ffi::v14::waldecoder::*; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; use postgres_ffi::Oid; -use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; +// Returns checkpoint LSN from controlfile +pub fn get_lsn_from_controlfile(path: &Path) -> Result { + // Read control file to extract the LSN + let controlfile_path = path.join("global").join("pg_control"); + let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?; + let lsn = controlfile.checkPoint; + + Ok(Lsn(lsn)) +} + /// /// Import all relation data pages from local disk into the repository. /// @@ -110,8 +121,8 @@ fn import_rel( let mut buf: [u8; 8192] = [0u8; 8192]; - ensure!(len % pg_constants::BLCKSZ as usize == 0); - let nblocks = len / pg_constants::BLCKSZ as usize; + ensure!(len % BLCKSZ as usize == 0); + let nblocks = len / BLCKSZ as usize; let rel = RelTag { spcnode: spcoid, @@ -120,7 +131,7 @@ fn import_rel( forknum, }; - let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / BLCKSZ as u32); // Call put_rel_creation for every segment of the relation, // because there is no guarantee about the order in which we are processing segments. @@ -144,8 +155,7 @@ fn import_rel( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - let relative_blknum = - blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + let relative_blknum = blknum - segno * (1024 * 1024 * 1024 / BLCKSZ as u32); ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } @@ -184,8 +194,8 @@ fn import_slru( .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; - ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ - let nblocks = len / pg_constants::BLCKSZ as usize; + ensure!(len % BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / BLCKSZ as usize; ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index da213704f3..64024a2d8d 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -1,5 +1,5 @@ use crate::repository::{key_range_size, singleton_range, Key}; -use postgres_ffi::pg_constants; +use postgres_ffi::BLCKSZ; use std::ops::Range; /// @@ -19,7 +19,7 @@ impl KeySpace { /// pub fn partition(&self, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as usize; let mut parts = Vec::new(); let mut current_part = Vec::new(); diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 8e75cbd4a2..6bf2e71852 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -59,7 +59,9 @@ mod storage_layer; mod timeline; use storage_layer::Layer; -use timeline::{LayeredTimeline, LayeredTimelineEntry}; +use timeline::LayeredTimelineEntry; + +pub use timeline::LayeredTimeline; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index dc8d7a2ad3..c130a42a8e 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -209,7 +209,7 @@ where reader: R, } -#[derive(Clone, Copy, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VisitDirection { Forwards, Backwards, diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 2d396024a0..6ef4915bdb 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -4,6 +4,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; +use metrics::core::{AtomicU64, GenericCounter}; use once_cell::sync::Lazy; use tracing::*; @@ -44,7 +45,7 @@ use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; use crate::DatadirTimeline; -use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, @@ -223,6 +224,70 @@ impl From for RepositoryTimeline { } } +struct TimelineMetrics { + pub reconstruct_time_histo: Histogram, + pub materialized_page_cache_hit_counter: GenericCounter, + pub flush_time_histo: Histogram, + pub compact_time_histo: Histogram, + pub create_images_time_histo: Histogram, + pub init_logical_size_histo: Histogram, + pub load_layer_map_histo: Histogram, + pub last_record_gauge: IntGauge, + pub wait_lsn_time_histo: Histogram, + pub current_physical_size_gauge: UIntGauge, +} + +impl TimelineMetrics { + fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) + .unwrap(); + let init_logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) + .unwrap(); + let load_layer_map_histo = STORAGE_TIME + .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + + TimelineMetrics { + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + init_logical_size_histo, + load_layer_map_histo, + last_record_gauge, + wait_lsn_time_histo, + current_physical_size_gauge, + } + } +} + pub struct LayeredTimeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -269,14 +334,7 @@ pub struct LayeredTimeline { ancestor_lsn: Lsn, // Metrics - reconstruct_time_histo: Histogram, - materialized_page_cache_hit_counter: IntCounter, - flush_time_histo: Histogram, - compact_time_histo: Histogram, - create_images_time_histo: Histogram, - last_record_gauge: IntGauge, - wait_lsn_time_histo: Histogram, - current_physical_size_gauge: UIntGauge, + metrics: TimelineMetrics, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_layers: AtomicBool, @@ -426,7 +484,7 @@ impl Timeline for LayeredTimeline { "wait_lsn called by WAL receiver thread" ); - self.wait_lsn_time_histo.observe_closure_duration( + self.metrics.wait_lsn_time_histo.observe_closure_duration( || self.last_record_lsn .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) .with_context(|| { @@ -468,7 +526,8 @@ impl Timeline for LayeredTimeline { self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - self.reconstruct_time_histo + self.metrics + .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } @@ -530,7 +589,7 @@ impl Timeline for LayeredTimeline { } fn get_physical_size(&self) -> u64 { - self.current_physical_size_gauge.get() + self.metrics.current_physical_size_gauge.get() } fn get_physical_size_non_incremental(&self) -> anyhow::Result { @@ -604,43 +663,6 @@ impl LayeredTimeline { walredo_mgr: Arc, upload_layers: bool, ) -> LayeredTimeline { - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "layer flush", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "compact", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "create images", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let mut result = LayeredTimeline { conf, tenant_conf, @@ -663,14 +685,7 @@ impl LayeredTimeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - last_record_gauge, - wait_lsn_time_histo, - current_physical_size_gauge, + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), upload_layers: AtomicBool::new(upload_layers), @@ -706,6 +721,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); let mut num_layers = 0; + let timer = self.metrics.load_layer_map_histo.start_timer(); + // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); @@ -777,7 +794,11 @@ impl LayeredTimeline { "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size ); - self.current_physical_size_gauge.set(total_physical_size); + self.metrics + .current_physical_size_gauge + .set(total_physical_size); + + timer.stop_and_record(); Ok(()) } @@ -808,12 +829,16 @@ impl LayeredTimeline { } } + let timer = self.metrics.init_logical_size_histo.start_timer(); + // Have to calculate it the hard way let last_lsn = self.get_last_record_lsn(); let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; self.current_logical_size .store(logical_size as isize, AtomicOrdering::SeqCst); debug!("calculated logical size the hard way: {}", logical_size); + + timer.stop_and_record(); Ok(()) } @@ -878,7 +903,7 @@ impl LayeredTimeline { ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { - self.materialized_page_cache_hit_counter.inc_by(1); + self.metrics.materialized_page_cache_hit_counter.inc_by(1); return Ok(()); } if prev_lsn <= cont_lsn { @@ -1074,7 +1099,7 @@ impl LayeredTimeline { fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); - self.last_record_gauge.set(new_lsn.0 as i64); + self.metrics.last_record_gauge.set(new_lsn.0 as i64); self.last_record_lsn.advance(new_lsn); } @@ -1178,7 +1203,7 @@ impl LayeredTimeline { } }; - let timer = self.flush_time_histo.start_timer(); + let timer = self.metrics.flush_time_histo.start_timer(); loop { let layers = self.layers.read().unwrap(); @@ -1349,7 +1374,7 @@ impl LayeredTimeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.current_physical_size_gauge.add(sz); + self.metrics.current_physical_size_gauge.add(sz); // update metrics NUM_PERSISTENT_FILES_CREATED.inc_by(1); PERSISTENT_BYTES_WRITTEN.inc_by(sz); @@ -1418,7 +1443,7 @@ impl LayeredTimeline { } // 3. Compact - let timer = self.compact_time_histo.start_timer(); + let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size)?; timer.stop_and_record(); } @@ -1494,7 +1519,7 @@ impl LayeredTimeline { lsn: Lsn, force: bool, ) -> Result> { - let timer = self.create_images_time_histo.start_timer(); + let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); let mut layer_paths_to_upload = HashSet::new(); for partition in partitioning.parts.iter() { @@ -1538,7 +1563,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); for l in image_layers { - self.current_physical_size_gauge + self.metrics + .current_physical_size_gauge .add(l.path().metadata()?.len()); layers.insert_historic(Arc::new(l)); } @@ -1788,7 +1814,8 @@ impl LayeredTimeline { let new_delta_path = l.path(); // update the timeline's physical size - self.current_physical_size_gauge + self.metrics + .current_physical_size_gauge .add(new_delta_path.metadata()?.len()); new_layer_paths.insert(new_delta_path); @@ -1801,7 +1828,9 @@ impl LayeredTimeline { drop(all_keys_iter); for l in deltas_to_compact { if let Some(path) = l.local_path() { - self.current_physical_size_gauge.sub(path.metadata()?.len()); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); layer_paths_do_delete.insert(path); } l.delete()?; @@ -2058,7 +2087,9 @@ impl LayeredTimeline { let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { if let Some(path) = doomed_layer.local_path() { - self.current_physical_size_gauge.sub(path.metadata()?.len()); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); layer_paths_to_delete.insert(path); } doomed_layer.delete()?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 140260e0d0..47fd8a84cf 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -28,7 +28,6 @@ use tracing::info; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; -use layered_repository::LayeredRepository; use pgdatadir_mapping::DatadirTimeline; /// Current storage format version @@ -62,9 +61,6 @@ pub enum CheckpointConfig { Forced, } -pub type RepositoryImpl = LayeredRepository; -pub type TimelineImpl = ::Timeline; - pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from // being accepted. diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 716df0f749..818eaf1b8f 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -83,7 +83,7 @@ pub fn get() -> &'static PageCache { } } -pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize; +pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize; const MAX_USAGE_COUNT: u8 = 5; /// diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3c5ea5267e..b63bb90be1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -40,9 +40,10 @@ use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; -use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::pg_constants; +use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData enum PagestreamFeMessage { @@ -725,10 +726,9 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let total_blocks = - timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64; + let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 113f40302a..88fac0ad5a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,8 +13,10 @@ use crate::repository::*; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::xlog_utils::TimestampTz; -use postgres_ffi::{pg_constants, Oid, TransactionId}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::TimestampTz; +use postgres_ffi::BLCKSZ; +use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; @@ -297,9 +299,9 @@ pub trait DatadirTimeline: Timeline { let clog_page = self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; - if clog_page.len() == pg_constants::BLCKSZ as usize + 8 { + if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]); + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); if timestamp >= search_timestamp { @@ -382,7 +384,7 @@ pub trait DatadirTimeline: Timeline { total_size += relsize as usize; } } - Ok(total_size * pg_constants::BLCKSZ as usize) + Ok(total_size * BLCKSZ as usize) } /// @@ -912,7 +914,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); self.pending_nblocks = 0; } @@ -940,7 +942,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); } Ok(()) @@ -1014,7 +1016,7 @@ struct SlruSegmentDirectory { segments: HashSet, } -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usize]); +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // Layout of the Key address space // diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index fadd41f547..e3d08f8b3d 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,9 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::{pg_constants, Oid}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::Oid; /// /// Relation data file segment id throughout the Postgres cluster. diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index a1a08b11d5..d09b01437c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -412,7 +412,6 @@ pub mod repo_harness { use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; - use crate::RepositoryImpl; use crate::{ config::PageServerConf, layered_repository::LayeredRepository, @@ -508,11 +507,11 @@ pub mod repo_harness { }) } - pub fn load(&self) -> RepositoryImpl { + pub fn load(&self) -> LayeredRepository { self.try_load().expect("failed to load test repo") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); let repo = LayeredRepository::new( diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 5a5cea9a4b..64f1caa542 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,16 +3,14 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::{load_metadata, LayeredRepository}; +use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline}; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; -use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; -use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; @@ -96,13 +94,13 @@ mod tenants_state { struct Tenant { state: TenantState, /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, + repo: Arc, /// Timelines, located locally in the pageserver's datadir. /// Timelines can entirely be removed entirely by the `detach` operation only. /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap::Timeline>>, + local_timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -179,7 +177,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc<::Timeline>, + datadir: Arc, }, } @@ -285,10 +283,8 @@ pub fn create_tenant_repository( conf, tenant_conf, tenant_id, - CreateRepo::Real { - wal_redo_manager, - remote_index, - }, + wal_redo_manager, + remote_index, )?; v.insert(Tenant { state: TenantState::Idle, @@ -369,7 +365,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: Ok(()) } -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -383,7 +379,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -488,9 +484,9 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any } fn load_local_timeline( - repo: &RepositoryImpl, + repo: &LayeredRepository, timeline_id: ZTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; @@ -634,7 +630,7 @@ fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 6002e8b2d9..0d35195691 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -3,7 +3,7 @@ // use anyhow::{bail, ensure, Context, Result}; -use postgres_ffi::ControlFileData; + use std::{ fs, path::Path, @@ -13,18 +13,21 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, logging, + crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::import_datadir; use crate::tenant_mgr; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, - tenant_config::TenantConfOpt, RepositoryImpl, TimelineImpl, + tenant_config::TenantConfOpt, +}; +use crate::{ + layered_repository::{LayeredRepository, LayeredTimeline}, + walredo::WalRedoManager, }; -use crate::{import_datadir, LOG_FILE_NAME}; -use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; use crate::{repository::Timeline, CheckpointConfig}; #[derive(Debug, Clone, Copy)] @@ -33,69 +36,13 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn init_pageserver( - conf: &'static PageServerConf, - create_tenant: Option, - initial_timeline_id: Option, -) -> anyhow::Result<()> { - // Initialize logger - // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let _log_file = logging::init(LOG_FILE_NAME, true)?; - - crashsafe_dir::create_dir_all(conf.tenants_path())?; - - if let Some(tenant_id) = create_tenant { - println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy) - .context("failed to create repo")?; - let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) - .context("failed to create initial timeline")?; - println!("initial timeline {} created", new_timeline_id) - } else if initial_timeline_id.is_some() { - println!("Ignoring initial timeline parameter, due to no tenant id to create given"); - } - - println!("pageserver init succeeded"); - Ok(()) -} - -pub enum CreateRepo { - Real { - wal_redo_manager: Arc, - remote_index: RemoteIndex, - }, - Dummy, -} - pub fn create_repo( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - create_repo: CreateRepo, -) -> Result> { - let (wal_redo_manager, remote_index) = match create_repo { - CreateRepo::Real { - wal_redo_manager, - remote_index, - } => (wal_redo_manager, remote_index), - CreateRepo::Dummy => { - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. - let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - - (wal_redo_manager as _, RemoteIndex::default()) - } - }; - + wal_redo_manager: Arc, + remote_index: RemoteIndex, +) -> Result> { let repo_dir = conf.tenant_path(&tenant_id); ensure!( !repo_dir.exists(), @@ -122,16 +69,6 @@ pub fn create_repo( ))) } -// Returns checkpoint LSN from controlfile -fn get_lsn_from_controlfile(path: &Path) -> Result { - // Read control file to extract the LSN - let controlfile_path = path.join("global").join("pg_control"); - let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?; - let lsn = controlfile.checkPoint; - - Ok(Lsn(lsn)) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -181,7 +118,7 @@ fn bootstrap_timeline( run_initdb(conf, &initdb_path)?; let pgdata_path = initdb_path; - let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); + let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. @@ -223,7 +160,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result)>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index b8064849e0..05afe4ba3e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,8 +22,8 @@ //! bespoken Rust code. use anyhow::Context; -use postgres_ffi::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; +use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{page_is_new, page_set_lsn}; use anyhow::Result; @@ -33,10 +33,12 @@ use tracing::*; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walrecord::*; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; -use postgres_ffi::{pg_constants, CheckPoint}; +use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); @@ -293,7 +295,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { // Extract page image from FPI record let img_len = blk.bimg_len as usize; let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize); + let mut image = BytesMut::with_capacity(BLCKSZ as usize); image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); if blk.hole_length != 0 { @@ -309,7 +311,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { if !page_is_new(&image) { page_set_lsn(&mut image, lsn) } - assert_eq!(image.len(), pg_constants::BLCKSZ as usize); + assert_eq!(image.len(), BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { let rec = ZenithWalRecord::Postgres { @@ -1033,7 +1035,8 @@ mod tests { use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; use crate::repository::Timeline; - use postgres_ffi::pg_constants; + use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; + use postgres_ffi::RELSEG_SIZE; /// Arbitrary relation tag, for testing. const TESTREL_A: RelTag = RelTag { @@ -1322,7 +1325,7 @@ mod tests { let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; - for blknum in 0..pg_constants::RELSEG_SIZE + 1 { + for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); @@ -1332,31 +1335,22 @@ mod tests { assert_current_logical_size(&*tline, Lsn(lsn)); - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE + 1 - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE + 1); // Truncate one block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE - 1 - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE - 1); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 2722bc7320..e8e0a7c52b 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,6 +16,7 @@ use std::{ time::Duration, }; +use crate::{layered_repository::LayeredTimeline, repository::Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -25,12 +26,7 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{ - exponential_backoff, - repository::{Repository, Timeline}, - DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use crate::{RepositoryImpl, TimelineImpl}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -43,7 +39,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -100,6 +96,8 @@ async fn connection_manager_loop_step( info!("Subscribed for etcd timeline changes, waiting for new etcd data"); loop { + let time_until_next_retry = walreceiver_state.time_until_next_retry(); + select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -114,27 +112,23 @@ async fn connection_manager_loop_step( } => { let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => { - *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1; - }, + TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { - if status.has_received_wal { - // Reset connection attempts here only, we know that safekeeper is healthy - // because it can send us a WAL update. - walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. + walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); } wal_connection.status = status; }, TaskEvent::End(end_result) => { match end_result { Ok(()) => debug!("WAL receiving task finished"), - Err(e) => { - warn!("WAL receiving task failed: {e}"); - // If the task failed, set the connection attempts to at least 1, to try other safekeepers. - let _ = *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(1); - } + Err(e) => warn!("WAL receiving task failed: {e}"), }; - walreceiver_state.wal_connection = None; + walreceiver_state.drop_old_connection(false).await; }, } }, @@ -158,6 +152,8 @@ async fn connection_manager_loop_step( } } }, + + _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} } // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. @@ -238,11 +234,15 @@ async fn subscribe_for_timeline_updates( } } +const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1; +const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0; +const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; + /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. @@ -251,7 +251,8 @@ struct WalreceiverState { max_lsn_wal_lag: NonZeroU64, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, - wal_connection_attempts: HashMap, + /// Info about retries and unsuccessful attempts to connect to safekeepers. + wal_connection_retries: HashMap, /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. wal_stream_candidates: HashMap, } @@ -259,6 +260,8 @@ struct WalreceiverState { /// Current connection data. #[derive(Debug)] struct WalConnection { + /// Time when the connection was initiated. + started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, /// Status of the connection. @@ -278,6 +281,12 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } +#[derive(Debug)] +struct RetryInfo { + next_retry_at: Option, + retry_duration_seconds: f64, +} + /// Data about the timeline to connect to, received from etcd. #[derive(Debug)] struct EtcdSkTimeline { @@ -291,7 +300,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc<::Timeline>, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -304,31 +313,18 @@ impl WalreceiverState { max_lsn_wal_lag, wal_connection: None, wal_stream_candidates: HashMap::new(), - wal_connection_attempts: HashMap::new(), + wal_connection_retries: HashMap::new(), } } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_source_connstr: String) { - if let Some(old_connection) = self.wal_connection.take() { - old_connection.connection_task.shutdown().await - } + self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; - let connection_attempt = self - .wal_connection_attempts - .get(&new_sk_id) - .copied() - .unwrap_or(0); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { - exponential_backoff( - connection_attempt, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; super::walreceiver_connection::handle_walreceiver_connection( id, &new_wal_source_connstr, @@ -344,10 +340,11 @@ impl WalreceiverState { let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { + started_at: now, sk_id: new_sk_id, status: WalConnectionStatus { is_connected: false, - has_received_wal: false, + has_processed_wal: false, latest_connection_update: now, latest_wal_update: now, streaming_lsn: None, @@ -358,6 +355,71 @@ impl WalreceiverState { }); } + /// Drops the current connection (if any) and updates retry timeout for the next + /// connection attempt to the same safekeeper. + async fn drop_old_connection(&mut self, needs_shutdown: bool) { + let wal_connection = match self.wal_connection.take() { + Some(wal_connection) => wal_connection, + None => return, + }; + + if needs_shutdown { + wal_connection.connection_task.shutdown().await; + } + + let retry = self + .wal_connection_retries + .entry(wal_connection.sk_id) + .or_insert(RetryInfo { + next_retry_at: None, + retry_duration_seconds: WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS, + }); + + let now = Utc::now().naive_utc(); + + // Schedule the next retry attempt. We want to have exponential backoff for connection attempts, + // and we add backoff to the time when we started the connection attempt. If the connection + // was active for a long time, then next_retry_at will be in the past. + retry.next_retry_at = + wal_connection + .started_at + .checked_add_signed(chrono::Duration::milliseconds( + (retry.retry_duration_seconds * 1000.0) as i64, + )); + + if let Some(next) = &retry.next_retry_at { + if next > &now { + info!( + "Next connection retry to {:?} is at {}", + wal_connection.sk_id, next + ); + } + } + + let next_retry_duration = + retry.retry_duration_seconds * WALCONNECTION_RETRY_BACKOFF_MULTIPLIER; + // Clamp the next retry duration to the maximum allowed. + let next_retry_duration = next_retry_duration.min(WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS); + // Clamp the next retry duration to the minimum allowed. + let next_retry_duration = next_retry_duration.max(WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS); + + retry.retry_duration_seconds = next_retry_duration; + } + + /// Returns time needed to wait to have a new candidate for WAL streaming. + fn time_until_next_retry(&self) -> Option { + let now = Utc::now().naive_utc(); + + let next_retry_at = self + .wal_connection_retries + .values() + .filter_map(|retry| retry.next_retry_at) + .filter(|next_retry_at| next_retry_at > &now) + .min(); + + next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + } + /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { match self @@ -551,52 +613,37 @@ impl WalreceiverState { /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. /// /// The candidate that is chosen: - /// * has fewest connection attempts from pageserver to safekeeper node (reset every time we receive a WAL message from the node) - /// * has greatest data Lsn among the ones that are left - /// - /// NOTE: - /// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but - /// otherwise to reset the connection attempts, a successful connection to that node is needed. - /// That won't happen now, before all nodes with less connection attempts are connected to first, which might leave the sk node with more advanced state to be ignored. + /// * has no pending retry cooldown + /// * has greatest commit_lsn among the ones that are left fn select_connection_candidate( &self, node_to_omit: Option, ) -> Option<(NodeId, &SkTimelineInfo, String)> { - let all_candidates = self - .applicable_connection_candidates() + self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) - .collect::>(); - - let smallest_attempts_allowed = all_candidates - .iter() - .map(|(sk_id, _, _)| { - self.wal_connection_attempts - .get(sk_id) - .copied() - .unwrap_or(0) - }) - .min()?; - - all_candidates - .into_iter() - .filter(|(sk_id, _, _)| { - smallest_attempts_allowed - >= self - .wal_connection_attempts - .get(sk_id) - .copied() - .unwrap_or(0) - }) .max_by_key(|(_, info, _)| info.commit_lsn) } /// Returns a list of safekeepers that have valid info and ready for connection. + /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, ) -> impl Iterator { + let now = Utc::now().naive_utc(); + self.wal_stream_candidates .iter() .filter(|(_, info)| info.timeline.commit_lsn.is_some()) + .filter(move |(sk_id, _)| { + let next_retry_at = self + .wal_connection_retries + .get(sk_id) + .and_then(|retry_info| { + retry_info.next_retry_at + }); + + next_retry_at.is_none() || next_retry_at.unwrap() <= now + }) .filter_map(|(sk_id, etcd_info)| { let info = &etcd_info.timeline; match wal_stream_connection_string( @@ -631,7 +678,7 @@ impl WalreceiverState { }); for node_id in node_ids_to_remove { - self.wal_connection_attempts.remove(&node_id); + self.wal_connection_retries.remove(&node_id); } } } @@ -688,7 +735,6 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use crate::repository::{ repo_harness::{RepoHarness, TIMELINE_ID}, Repository, @@ -793,7 +839,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), @@ -802,6 +848,7 @@ mod tests { state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); state.wal_connection = Some(WalConnection { + started_at: now, sk_id: connected_sk_id, status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1021,7 +1068,13 @@ mod tests { }, ), ]); - state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]); + state.wal_connection_retries = HashMap::from([( + NodeId(0), + RetryInfo { + next_retry_at: now.checked_add_signed(chrono::Duration::hours(1)), + retry_duration_seconds: WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS, + }, + )]); let candidate_with_less_errors = state .next_connection_candidate() @@ -1029,7 +1082,7 @@ mod tests { assert_eq!( candidate_with_less_errors.safekeeper_id, NodeId(1), - "Should select the node with less connection errors" + "Should select the node with no pending retry cooldown" ); Ok(()) @@ -1047,7 +1100,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: now, commit_lsn: Some(current_lsn), @@ -1055,6 +1108,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: connected_sk_id, status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1134,7 +1188,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: time_over_threshold, latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), @@ -1142,6 +1196,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: NodeId(1), status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1206,7 +1261,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), @@ -1214,6 +1269,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: NodeId(1), status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), @@ -1285,7 +1341,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), wal_connection: None, wal_stream_candidates: HashMap::new(), - wal_connection_attempts: HashMap::new(), + wal_connection_retries: HashMap::new(), } } } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 16a1f232e3..025bfeb506 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -27,7 +27,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::v14::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; /// Status of the connection. @@ -35,8 +35,9 @@ use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; pub struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, - /// Defines a healthy connection as one on which we have received at least some WAL bytes. - pub has_received_wal: bool, + /// Defines a healthy connection as one on which pageserver received WAL from safekeeper + /// and is able to process it in walingest without errors. + pub has_processed_wal: bool, /// Connection establishment time or the timestamp of a latest connection message received. pub latest_connection_update: NaiveDateTime, /// Time of the latest WAL message received. @@ -71,7 +72,7 @@ pub async fn handle_walreceiver_connection( info!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: false, + has_processed_wal: false, latest_connection_update: Utc::now().naive_utc(), latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, @@ -117,13 +118,6 @@ pub async fn handle_walreceiver_connection( let identify = identify_system(&mut replication_client).await?; info!("{identify:?}"); - connection_status.latest_connection_update = Utc::now().naive_utc(); - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); - return Ok(()); - } - - // NB: this is a flush_lsn, not a commit_lsn. let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; let ZTenantTimelineId { @@ -131,6 +125,14 @@ pub async fn handle_walreceiver_connection( timeline_id, } = id; + connection_status.latest_connection_update = Utc::now().naive_utc(); + connection_status.latest_wal_update = Utc::now().naive_utc(); + connection_status.commit_lsn = Some(end_of_wal); + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + return Ok(()); + } + let (repo, timeline) = tokio::task::spawn_blocking(move || { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {tenant_id}"))?; @@ -181,6 +183,7 @@ pub async fn handle_walreceiver_connection( } { let replication_message = replication_message?; let now = Utc::now().naive_utc(); + let last_rec_lsn_before_msg = last_rec_lsn; // Update the connection status before processing the message. If the message processing // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper. @@ -193,7 +196,6 @@ pub async fn handle_walreceiver_connection( )); if !xlog_data.data().is_empty() { connection_status.latest_wal_update = now; - connection_status.has_received_wal = true; } } ReplicationMessage::PrimaryKeepAlive(keepalive) => { @@ -265,6 +267,15 @@ pub async fn handle_walreceiver_connection( _ => None, }; + if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { + // We have successfully processed at least one WAL record. + connection_status.has_processed_wal = true; + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + let timeline_to_check = Arc::clone(&timeline); tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) .await diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 6b01d52005..c56b1c6c0c 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,9 +3,10 @@ //! use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; -use postgres_ffi::XLogRecord; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::v14::XLogRecord; +use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -618,7 +619,7 @@ pub fn decode_wal_record( blk.hole_length = 0; } } else { - blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len; + blk.hole_length = BLCKSZ - blk.bimg_len; } datatotal += blk.bimg_len as u32; blocks_total_len += blk.bimg_len as u32; @@ -628,9 +629,7 @@ pub fn decode_wal_record( * bimg_len < BLCKSZ if the HAS_HOLE flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 - || blk.hole_length == 0 - || blk.bimg_len == pg_constants::BLCKSZ) + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) { // TODO /* @@ -667,7 +666,7 @@ pub fn decode_wal_record( * flag is set. */ if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == pg_constants::BLCKSZ + && blk.bimg_len == BLCKSZ { // TODO /* @@ -685,7 +684,7 @@ pub fn decode_wal_record( */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 - && blk.bimg_len != pg_constants::BLCKSZ + && blk.bimg_len != BLCKSZ { // TODO /* diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 85f970a941..9cf347573a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -44,11 +44,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; use metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset; -use postgres_ffi::nonrelfile_utils::transaction_id_set_status; -use postgres_ffi::pg_constants; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::BLCKSZ; /// /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. @@ -82,24 +83,6 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } -/// -/// A dummy WAL Redo Manager implementation that doesn't allow replaying -/// anything. Currently used during bootstrapping (zenith init), to create -/// a Repository object without launching the real WAL redo process. -/// -pub struct DummyRedoManager {} -impl crate::walredo::WalRedoManager for DummyRedoManager { - fn request_redo( - &self, - _key: Key, - _lsn: Lsn, - _base_img: Option, - _records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - Err(WalRedoError::InvalidState) - } -} - // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting @@ -435,10 +418,10 @@ impl PostgresRedoManager { } // Append the timestamp - if page.len() == pg_constants::BLCKSZ as usize + 8 { - page.truncate(pg_constants::BLCKSZ as usize); + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); } - if page.len() == pg_constants::BLCKSZ as usize { + if page.len() == BLCKSZ as usize { page.extend_from_slice(×tamp.to_be_bytes()); } else { warn!( @@ -759,7 +742,7 @@ impl PostgresRedoProcess { // We expect the WAL redo process to respond with an 8k page image. We read it // into this buffer. - let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()]; + let mut resultbuf = vec![0; BLCKSZ.into()]; let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far // Prepare for calling poll() @@ -772,7 +755,7 @@ impl PostgresRedoProcess { // We do three things simultaneously: send the old base image and WAL records to // the child process's stdin, read the result from child's stdout, and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < pg_constants::BLCKSZ.into() { + while nresult < BLCKSZ.into() { // If we have more data to write, wake up if 'stdin' becomes writeable or // we have data to read. Otherwise only wake up if there's data to read. let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b10ede8d5e..bb7e7ef67b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -86,7 +86,7 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BackendType { /// Legacy Cloud API (V1) + link auth. LegacyConsole(T), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 896ef3588d..3bad36661b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -65,8 +65,17 @@ impl NodeInfo { // require for our business. let mut connection_error = None; let ports = self.config.get_ports(); - for (i, host) in self.config.get_hosts().iter().enumerate() { - let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432); + let hosts = self.config.get_hosts(); + // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array + if ports.len() > 1 && ports.len() != hosts.len() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config), + )); + } + + for (i, host) in hosts.iter().enumerate() { + let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); let host = match host { Host::Tcp(host) => host.as_str(), Host::Unix(_) => continue, // unix sockets are not welcome here diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index f6e6133adf..05855e74df 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18; fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { for mut chars in parts.map(|s| s.chars()) { let attr = chars.next()?; - if !('a'..'z').contains(&attr) && !('A'..'Z').contains(&attr) { + if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) { return None; } let eq = chars.next()?; diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 5e749796dd..91d2f61c10 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -40,7 +40,7 @@ struct SafeKeeperStateV1 { wal_start_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV2 { /// Postgres server version pub pg_version: u32, @@ -70,7 +70,7 @@ pub struct SafeKeeperStateV2 { pub wal_start_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV3 { /// Postgres server version pub pg_version: u32, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index a8121e829e..c90c2a0446 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -9,7 +9,7 @@ use crate::timeline::{Timeline, TimelineTools}; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; -use postgres_ffi::xlog_utils::PG_TLI; +use postgres_ffi::PG_TLI; use regex::Regex; use std::str::FromStr; use std::sync::Arc; @@ -90,7 +90,10 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { let cmd = parse_cmd(query_string)?; - info!("got query {:?}", query_string); + info!( + "got query {:?} in timeline {:?}", + query_string, self.ztimelineid + ); let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); @@ -106,23 +109,17 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } match cmd { - SafekeeperPostgresCommand::StartWalPush => { - ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn")?; - } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn")?; - } - SafekeeperPostgresCommand::IdentifySystem => { - self.handle_identify_system(pgb)?; - } - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd)?; - } + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) + .run(self) + .context("failed to run ReceiveWalConn"), + SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) + .run(self, pgb, start_lsn) + .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } + .context(format!("timeline {timelineid}"))?; + Ok(()) } } @@ -153,8 +150,15 @@ impl SafekeeperPostgresHandler { /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { - let start_pos = self.timeline.get().get_end_of_wal(); - let lsn = start_pos.to_string(); + let lsn = if self.is_walproposer_recovery() { + // walproposer should get all local WAL until flush_lsn + self.timeline.get().get_end_of_wal() + } else { + // other clients shouldn't get any uncommitted WAL + self.timeline.get().get_state().0.commit_lsn + } + .to_string(); + let sysid = self .timeline .get() @@ -203,4 +207,11 @@ impl SafekeeperPostgresHandler { .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; Ok(()) } + + /// Returns true if current connection is a replication connection, originating + /// from a walproposer recovery function. This connection gets a special handling: + /// safekeeper must stream all local WAL till the flush_lsn, whether committed or not. + pub fn is_walproposer_recovery(&self) -> bool { + self.appname == Some("wal_proposer_recovery".to_string()) + } } diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 97fb3654d2..3f84e7b183 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,8 +7,7 @@ //! use anyhow::Result; -use bytes::{BufMut, Bytes, BytesMut}; -use crc32c::crc32c_append; +use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; @@ -19,9 +18,8 @@ use crate::safekeeper::{ }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::TimelineTools; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils; -use postgres_ffi::{uint32, uint64, Oid, XLogRecord}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils; use utils::{ lsn::Lsn, postgres_backend::PostgresBackend, @@ -144,7 +142,7 @@ fn append_logical_message( spg: &mut SafekeeperPostgresHandler, msg: &AppendLogicalMessage, ) -> Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = spg.timeline.get().get_state().1; let begin_lsn = msg.begin_lsn; @@ -182,90 +180,3 @@ fn append_logical_message( append_response, }) } - -#[repr(C)] -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -struct XlLogicalMessage { - db_id: Oid, - transactional: uint32, // bool, takes 4 bytes due to alignment in C structures - prefix_size: uint64, - message_size: uint64, -} - -impl XlLogicalMessage { - pub fn encode(&self) -> Bytes { - use utils::bin_ser::LeSer; - self.ser().unwrap().into() - } -} - -/// Create new WAL record for non-transactional logical message. -/// Used for creating artificial WAL for tests, as LogicalMessage -/// record is basically no-op. -fn encode_logical_message(prefix: &str, message: &str) -> Vec { - let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1); - prefix_bytes.put(prefix.as_bytes()); - prefix_bytes.put_u8(0); - - let message_bytes = message.as_bytes(); - - let logical_message = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as u64, - message_size: message_bytes.len() as u64, - }; - - let mainrdata = logical_message.encode(); - let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); - // only short mainrdata is supported for now - assert!(mainrdata_len <= 255); - let mainrdata_len = mainrdata_len as u8; - - let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; - data.extend_from_slice(&mainrdata); - data.extend_from_slice(&prefix_bytes); - data.extend_from_slice(message_bytes); - - let total_len = xlog_utils::XLOG_SIZE_OF_XLOG_RECORD + data.len(); - - let mut header = XLogRecord { - xl_tot_len: total_len as u32, - xl_xid: 0, - xl_prev: 0, - xl_info: 0, - xl_rmid: 21, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: 0, // crc will be calculated later - }; - - let header_bytes = header.encode().expect("failed to encode header"); - let crc = crc32c_append(0, &data); - let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode().expect("failed to encode header")); - wal.extend_from_slice(&data); - - // WAL start position must be aligned at 8 bytes, - // this will add padding for the next WAL record. - const PADDING: usize = 8; - let padding_rem = wal.len() % PADDING; - if padding_rem != 0 { - wal.resize(wal.len() + PADDING - padding_rem, 0); - } - - wal -} - -#[test] -fn test_encode_logical_message() { - let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, - 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, - ]; - let actual = encode_logical_message("prefix", "message"); - assert_eq!(expected, actual[..]); -} diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index fe4f9d231c..648f0634f8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,7 +7,7 @@ use metrics::{ proto::MetricFamily, Gauge, IntGaugeVec, }; -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::XLogSegNo; use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::{ diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a9373cb584..22f8ca2de4 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,9 +5,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use etcd_broker::subscription_value::SkTimelineInfo; -use postgres_ffi::xlog_utils::TimeLineID; - -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -19,7 +17,6 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, lsn::Lsn, @@ -127,7 +124,7 @@ impl AcceptorState { /// Information about Postgres. Safekeeper gets it once and then verifies /// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfo { /// Postgres server version pub pg_version: u32, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7439d6a8f6..97ec945c3e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -6,7 +6,7 @@ use crate::timeline::{ReplicaState, Timeline, TimelineTools}; use crate::wal_storage::WalReader; use anyhow::{bail, Context, Result}; -use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; +use postgres_ffi::v14::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; @@ -36,7 +36,7 @@ const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; /// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct HotStandbyFeedback { pub ts: TimestampTz, pub xmin: FullTransactionId, @@ -170,6 +170,7 @@ impl ReplicationConn { // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); let bg_stream_in = self.stream_in.take().unwrap(); + let bg_timeline_id = spg.ztimelineid.unwrap(); let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. @@ -188,6 +189,8 @@ impl ReplicationConn { let _ = thread::Builder::new() .name("HotStandbyFeedback thread".into()) .spawn(move || { + let _enter = + info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered(); if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { error!("Replication background thread failed: {}", err); } @@ -198,13 +201,12 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (_, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = spg.timeline.get().get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check if persisted_state.server.wal_seg_size == 0 { bail!("Cannot start replication before connecting to walproposer"); } - let wal_end = spg.timeline.get().get_end_of_wal(); // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will // hang otherwise). That's because walproposer runs the consensus and @@ -214,8 +216,8 @@ impl ReplicationConn { // another compute rises which collects majority and starts fixing log // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. - let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) - { + let stop_pos: Option = if spg.is_walproposer_recovery() { + let wal_end = spg.timeline.get().get_end_of_wal(); Some(wal_end) } else { None @@ -226,7 +228,7 @@ impl ReplicationConn { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse)?; - let mut end_pos = Lsn(0); + let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( spg.conf.timeline_dir(&spg.timeline.get().zttid), diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 161fca3595..3a10c5d59e 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,8 +4,9 @@ use anyhow::{bail, Context, Result}; use etcd_broker::subscription_value::SkTimelineInfo; + use once_cell::sync::Lazy; -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::XLogSegNo; use serde::Serialize; use tokio::sync::watch; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index b2f9d8d4f3..3552452470 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::PG_TLI; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs::File; use tokio::runtime::Builder; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5f4bf588c7..6a45ae1411 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -13,9 +13,10 @@ use std::pin::Pin; use tokio::io::AsyncRead; use once_cell::sync::Lazy; -use postgres_ffi::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI, +use postgres_ffi::v14::xlog_utils::{ + find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, }; +use postgres_ffi::PG_TLI; use std::cmp::min; use std::fs::{self, remove_file, File, OpenOptions}; @@ -30,9 +31,10 @@ use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; +use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::v14::waldecoder::WalStreamDecoder; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; diff --git a/setup.cfg b/setup.cfg index d1a2f9a359..7f8c45c8c3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,10 @@ exclude = ^vendor/ # some tests don't typecheck when this flag is set check_untyped_defs = false +# Help mypy find imports when running against list of individual files. +# Without this line it would behave differently when executed on the entire project. +mypy_path = $MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner + disallow_incomplete_defs = false disallow_untyped_calls = false disallow_untyped_decorators = false diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 51df41699a..710b220ae8 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,7 +1,11 @@ from typing import Optional from uuid import uuid4, UUID import pytest +import pathlib +import os +import subprocess from fixtures.utils import lsn_from_hex +from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -9,16 +13,43 @@ from fixtures.neon_fixtures import ( NeonPageserverHttpClient, NeonPageserverApiException, wait_until, + neon_binpath, + pg_distrib_dir, ) -# test that we cannot override node id -def test_pageserver_init_node_id(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init() - with pytest.raises( - Exception, - match="node id can only be set during pageserver init and cannot be overridden"): - env.pageserver.start(overrides=['--pageserver-config-override=id=10']) +# test that we cannot override node id after init +def test_pageserver_init_node_id(neon_simple_env: NeonEnv): + repo_dir = neon_simple_env.repo_dir + pageserver_config = repo_dir / 'pageserver.toml' + pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' + run_pageserver = lambda args: subprocess.run([str(pageserver_bin), '-D', str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # remove initial config + pageserver_config.unlink() + + bad_init = run_pageserver(['--init', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_init.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "missing id" in bad_init.stderr + assert not pageserver_config.exists(), 'config file should not be created after init error' + + completed_init = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert completed_init.returncode == 0, 'pageserver should be able to create a new config with the node id given' + assert pageserver_config.exists(), 'config file should be created successfully' + + bad_reinit = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_reinit.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "already exists, cannot init it" in bad_reinit.stderr + + bad_update = run_pageserver(['--update-config', '-c', 'id = 3']) + assert bad_update.returncode == 1, 'pageserver should not allow updating node id' + assert "has node id already, it cannot be overridden" in bad_update.stderr def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): diff --git a/test_runner/batch_others/test_setup.py b/test_runner/batch_others/test_setup.py new file mode 100644 index 0000000000..3d1471621b --- /dev/null +++ b/test_runner/batch_others/test_setup.py @@ -0,0 +1,17 @@ +"""Tests for the code in test fixtures""" + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that pageserver and safekeeper can restart quickly. +# This is a regression test, see https://github.com/neondatabase/neon/issues/2247 +def test_fixture_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + for i in range(3): + env.pageserver.stop() + env.pageserver.start() + + for i in range(3): + env.safekeepers[0].stop() + env.safekeepers[0].start() diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 176ca740fe..eb65e2e3b5 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -44,30 +44,22 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers """ - cmd = [ - str(pageserver_bin), - '--init', - '--workdir', - str(new_pageserver_dir), - f"-c listen_pg_addr='localhost:{pg_port}'", - f"-c listen_http_addr='localhost:{http_port}'", - f"-c pg_distrib_dir='{pg_distrib_dir}'", - f"-c id=2", - f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", - ] - - if broker is not None: - cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) - - subprocess.check_output(cmd, text=True) - # actually run new pageserver cmd = [ str(pageserver_bin), '--workdir', str(new_pageserver_dir), '--daemonize', + '--update-config', + f"-c listen_pg_addr='localhost:{pg_port}'", + f"-c listen_http_addr='localhost:{http_port}'", + f"-c pg_distrib_dir='{pg_distrib_dir}'", + f"-c id=2", + f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] + if broker is not None: + cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + log.info("starting new pageserver %s", cmd) out = subprocess.check_output(cmd, text=True) log.info("started new pageserver %s", out)