From 18f251384d68b1d39fc885b4d3416f363e85fe27 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 16 Aug 2022 11:10:38 +0300 Subject: [PATCH 01/21] Check for entire range during sasl validation (#2281) --- proxy/src/scram/messages.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index f6e6133adf..05855e74df 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18; fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { for mut chars in parts.map(|s| s.chars()) { let attr = chars.next()?; - if !('a'..'z').contains(&attr) && !('A'..'Z').contains(&attr) { + if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) { return None; } let eq = chars.next()?; From b8f0f37de22034363ac895104eebbe0ccb0bbfd8 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 11:15:35 +0200 Subject: [PATCH 02/21] Gen2 GH runner (#2128) * Re-add rustup override * Try s3 bucket * Set git version * Use v4 cache key to prevent problems * Switch to v5 for key * Add second rustup fix * Rebase * Add kaniko steps * Fix typo and set compress level * Disable global run default * Specify shell for step * Change approach with kaniko * Try less verbose shell spec * Add submodule pull * Add promote step * Adjust dependency chain * Try default swap again * Use env * Don't override aws key * Make kaniko build conditional * Specify runs on * Try without dependency link * Try soft fail * Use image with git * Try passing to next step * Fix duplicate * Try other approach * Try other approach * Fix typo * Try other syntax * Set env * Adjust setup * Try step 1 * Add link * Try global env * Fix mistake * Debug * Try other syntax * Try other approach * Change order * Move output one step down * Put output up one level * Try other syntax * Skip build * Try output * Re-enable build * Try other syntax * Skip middle step * Update check * Try first step of dockerhub push * Update needs dependency * Try explicit dir * Add missing package * Try other approach * Try other approach * Specify region * Use with * Try other approach * Add debug * Try other approach * Set region * Follow AWS example * Try github approach * Skip Qemu * Try stdin * Missing steps * Add missing close * Add echo debug * Try v2 endpoint * Use v1 endpoint * Try without quotes * Revert * Try crane * Add debug * Split steps * Fix duplicate * Add shell step * Conform to options * Add verbose flag * Try single step * Try workaround * First request fails hunch * Try bullseye image * Try other approach * Adjust verbose level * Try previous step * Add more debug * Remove debug step * Remove rogue indent * Try with larger image * Add build tag step * Update workflow for testing * Add tag step for test * Remove unused * Update dependency chain * Add ownership fix * Use matrix for promote * Force update * Force build * Remove unused * Add new image * Add missing argument * Update dockerfile copy * Update Dockerfile * Update clone * Update dockerfile * Go to correct folder * Use correct format * Update dockerfile * Remove cd * Debug find where we are * Add debug on first step * Changedir to postgres * Set workdir * Use v1 approach * Use other dependency * Try other approach * Try other approach * Update dockerfile * Update approach * Update dockerfile * Update approach * Update dockerfile * Update dockerfile * Add workspace hack * Update Dockerfile * Update Dockerfile * Update Dockerfile * Change last step * Cleanup pull in prep for review * Force build images * Add condition for latest tagging * Use pinned version * Try without name value * Remove more names * Shorten names * Add kaniko comments * Pin kaniko * Pin crane and ecr helper * Up one level * Switch to pinned tag for rust image * Force update for test Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 312 +++++++++++++++------------ 1 file changed, 171 insertions(+), 141 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 635c6126cc..99859197a1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -7,10 +7,6 @@ on: - release pull_request: -defaults: - run: - shell: bash -euxo pipefail {0} - concurrency: # Allow only one workflow per any non-`main` branch. group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} @@ -23,7 +19,7 @@ env: jobs: build-neon: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned strategy: fail-fast: false matrix: @@ -35,7 +31,7 @@ jobs: GIT_VERSION: ${{ github.sha }} steps: - - name: Fix git ownerwhip + - name: Fix git ownership run: | # Workaround for `fatal: detected dubious ownership in repository at ...` # @@ -54,6 +50,7 @@ jobs: - name: Set pg revision for caching id: pg_ver run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. # @@ -77,6 +74,7 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + shell: bash -euxo pipefail {0} # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache @@ -93,8 +91,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -106,14 +104,17 @@ jobs: - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: mold -run make postgres -j$(nproc) + shell: bash -euxo pipefail {0} - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + shell: bash -euxo pipefail {0} - name: Run cargo test run: | ${cov_prefix} cargo test $CARGO_FLAGS + shell: bash -euxo pipefail {0} - name: Install rust binaries run: | @@ -154,9 +155,11 @@ jobs: echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done fi + shell: bash -euxo pipefail {0} - name: Install postgres binaries run: cp -a tmp_install /tmp/neon/pg_install + shell: bash -euxo pipefail {0} - name: Upload Neon artifact uses: ./.github/actions/upload @@ -171,7 +174,7 @@ jobs: pg_regress-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] strategy: fail-fast: false @@ -199,7 +202,7 @@ jobs: other-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] strategy: fail-fast: false @@ -230,7 +233,7 @@ jobs: benchmarks: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: @@ -261,7 +264,7 @@ jobs: coverage-report: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948 + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ other-tests, pg_regress-tests ] strategy: fail-fast: false @@ -284,7 +287,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -300,6 +303,7 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + shell: bash -euxo pipefail {0} - name: Build and upload coverage report run: | @@ -332,9 +336,11 @@ jobs: \"description\": \"Coverage report is ready\", \"target_url\": \"$REPORT_URL\" }" + shell: bash -euxo pipefail {0} trigger-e2e-tests: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned needs: [ build-neon ] steps: - name: Set PR's status to pending and request a remote CI test @@ -369,32 +375,111 @@ jobs: } }" - docker-image: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ pg_regress-tests, other-tests ] - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + dockerfile-check: + if: github.event_name != 'workflow_dispatch' + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest outputs: - build-tag: ${{steps.build-tag.outputs.tag}} + value: ${{ steps.dockerfile-check.outputs.any_changed }} steps: - name: Checkout uses: actions/checkout@v3 + + - name: Get specific changed files + id: dockerfile-check + uses: tj-actions/changed-files@802732316a11c01531ea72773ec7998155238e31 # v25 + with: + files: | + Dockerfile + Dockerfile.compute-tools + ./vendor/postgres/Dockerfile + + neon-image: + # force building for all 3 images + if: needs.dockerfile-check.outputs.value != 'true' + runs-on: dev + needs: [ dockerfile-check ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + environment: dev + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko with: submodules: true fetch-depth: 0 - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - with: - driver: docker + - name: Kaniko build console + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID + compute-tools-image: + if: needs.dockerfile-check.outputs.value != 'true' + runs-on: dev + needs: [ dockerfile-check ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + environment: dev + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build console + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID + + compute-node-image: + if: needs.dockerfile-check.outputs.value != 'true' + runs-on: dev + needs: [ dockerfile-check ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + environment: dev + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build console + working-directory: ./vendor/postgres/ + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID + + promote-images: + runs-on: dev + needs: [ neon-image, compute-tools-image, compute-node-image ] + if: github.event_name != 'workflow_dispatch' + container: amazon/aws-cli + strategy: + fail-fast: false + matrix: + name: [ neon, compute-tools, compute-node ] + + steps: + - name: Promote image to latest + run: + MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + + push-docker-hub: + runs-on: dev + needs: [ promote-images ] + container: golang:1.19-bullseye + environment: dev + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - name: Get build tag run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -402,117 +487,48 @@ jobs: elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "::set-output name=tag::release-$(git rev-list --count HEAD)" else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " + echo "::set-output name=tag::$GITHUB_RUN_ID" fi id: build-tag - - name: Get legacy build tag + - name: Configure ECR login run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: legacy-build-tag + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - name: Build neon Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - pull: true - push: true - tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} + - name: Pull neon image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon - docker-image-compute: - runs-on: [ self-hosted, Linux, k8s-runner ] - needs: [ pg_regress-tests, other-tests ] - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - outputs: - build-tag: ${{steps.build-tag.outputs.tag}} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 + - name: Pull compute tools image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - name: Pull compute node image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - with: - driver: docker - - - name: Get build tag + - name: Configure docker login run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: build-tag + # ECR Credential Helper & Docker Hub don't work together in config, hence reset + echo "" > /github/home/.docker/config.json + crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - name: Get legacy build tag + - name: Push neon image to Docker Hub + run: crane push neon neondatabase/neon:${{steps.build-tag.outputs.tag}} + + - name: Push compute tools image to Docker Hub + run: crane push compute-tools neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} + + - name: Push compute node image to Docker Hub + run: crane push compute-node neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::latest" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - id: legacy-build-tag - - - name: Build compute-tools Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - push: false - file: Dockerfile.compute-tools - tags: neondatabase/compute-tools:local - - - name: Push compute-tools Docker image - uses: docker/build-push-action@v2 - with: - context: . - build-args: | - GIT_VERSION="${{github.sha}}" - AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" - AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" - push: true - file: Dockerfile.compute-tools - tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} - - - name: Build compute-node Docker image - uses: docker/build-push-action@v2 - with: - context: ./vendor/postgres/ - build-args: - COMPUTE_TOOLS_TAG=local - push: true - tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + crane tag neondatabase/neon:${{steps.build-tag.outputs.tag}} latest + crane tag neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} latest + crane tag neondatabase/compute-node:${{steps.build-tag.outputs.tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] @@ -537,15 +553,17 @@ jobs: fi deploy: - runs-on: [ self-hosted, Linux, k8s-runner ] - # We need both storage **and** compute images for deploy, because control plane - # picks the compute version based on the storage version. If it notices a fresh - # storage it may bump the compute version. And if compute image failed to build - # it may break things badly. - needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} @@ -556,8 +574,14 @@ jobs: submodules: true fetch-depth: 0 + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Setup ansible run: | + export PATH="/root/.local/bin:$PATH" pip install --progress-bar off --user ansible boto3 - name: Redeploy @@ -585,13 +609,16 @@ jobs: rm -f neon_install.tar.gz .neon_current_version deploy-proxy: - runs-on: [ self-hosted, Linux, k8s-runner ] - # Compute image isn't strictly required for proxy deploy, but let's still wait for it - # to run all deploy jobs consistently. - needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash strategy: matrix: include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} @@ -604,6 +631,9 @@ jobs: submodules: true fetch-depth: 0 + - name: Add curl + run: apt update && apt install curl -y + - name: Store kubeconfig file run: | echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} @@ -618,4 +648,4 @@ jobs: run: | DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s \ No newline at end of file From 83f7b8ed2298f4d94b14501ec6c3d2bc2f9e8c80 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 13:41:51 +0200 Subject: [PATCH 03/21] Add missing step output, revert one deploy step (#2285) * Add missing step output, revert one deploy step * Conform to syntax * Update approach * Add missing value * Add missing needs Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 66 ++++++++++++++++++---------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 99859197a1..57214e0bfe 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -17,6 +17,24 @@ env: COPT: '-Werror' jobs: + tag: + runs-on: dev + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + - name: Get build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::$(git rev-list --count HEAD)" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " + echo "::set-output name=tag::$GITHUB_RUN_ID" + fi + id: build-tag + build-neon: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned @@ -470,7 +488,7 @@ jobs: push-docker-hub: runs-on: dev - needs: [ promote-images ] + needs: [ promote-images, tag ] container: golang:1.19-bullseye environment: dev @@ -480,17 +498,17 @@ jobs: go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 - - name: Get build tag - run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - echo "::set-output name=tag::$(git rev-list --count HEAD)" - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - echo "::set-output name=tag::release-$(git rev-list --count HEAD)" - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " - echo "::set-output name=tag::$GITHUB_RUN_ID" - fi - id: build-tag +# - name: Get build tag +# run: | +# if [[ "$GITHUB_REF_NAME" == "main" ]]; then +# echo "::set-output name=tag::$(git rev-list --count HEAD)" +# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then +# echo "::set-output name=tag::release-$(git rev-list --count HEAD)" +# else +# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " +# echo "::set-output name=tag::$GITHUB_RUN_ID" +# fi +# id: build-tag - name: Configure ECR login run: | @@ -513,22 +531,22 @@ jobs: crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - name: Push neon image to Docker Hub - run: crane push neon neondatabase/neon:${{steps.build-tag.outputs.tag}} + run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} - name: Push compute tools image to Docker Hub - run: crane push compute-tools neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} + run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} - name: Push compute node image to Docker Hub - run: crane push compute-node neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}} - name: Add latest tag to images if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' run: | - crane tag neondatabase/neon:${{steps.build-tag.outputs.tag}} latest - crane tag neondatabase/compute-tools:${{steps.build-tag.outputs.tag}} latest - crane tag neondatabase/compute-node:${{steps.build-tag.outputs.tag}} latest + crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: runs-on: [ self-hosted, Linux, k8s-runner ] @@ -553,11 +571,11 @@ jobs: fi deploy: - runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + runs-on: [ self-hosted, Linux, k8s-runner ] + #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets ] + needs: [ push-docker-hub, calculate-deploy-targets, tag ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -586,7 +604,7 @@ jobs: - name: Redeploy run: | - export DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} cd "$(pwd)/.github/ansible" if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -612,7 +630,7 @@ jobs: runs-on: dev container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets ] + needs: [ push-docker-hub, calculate-deploy-targets, tag ] if: | (github.ref_name == 'main' || github.ref_name == 'release') && github.event_name != 'workflow_dispatch' @@ -646,6 +664,6 @@ jobs: - name: Re-deploy proxy run: | - DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s \ No newline at end of file From 4cde0e7a37d15ec1c6a5ca3a074a0a950a08cd60 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 13:59:41 +0200 Subject: [PATCH 04/21] Error for fatal not git repo (#2286) Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 57214e0bfe..004797e502 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -19,10 +19,14 @@ env: jobs: tag: runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest outputs: build-tag: ${{steps.build-tag.outputs.tag}} steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Get build tag run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then @@ -33,6 +37,7 @@ jobs: echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " echo "::set-output name=tag::$GITHUB_RUN_ID" fi + shell: bash id: build-tag build-neon: From 1d4114183c09b01069370a19c1d4a855f8bf2571 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 15:41:31 +0200 Subject: [PATCH 05/21] Use main, not branch for ref check (#2288) * Use main, not branch for ref check * Add more debug * Count main, not head * Try new approach * Conform to syntax * Update approach * Get full history * Skip checkout * Cleanup debug * Remove more debug Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 004797e502..2f5c03f794 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -26,15 +26,20 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Get build tag run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) if [[ "$GITHUB_REF_NAME" == "main" ]]; then echo "::set-output name=tag::$(git rev-list --count HEAD)" elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "::set-output name=tag::release-$(git rev-list --count HEAD)" else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' " + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" echo "::set-output name=tag::$GITHUB_RUN_ID" fi shell: bash From 9218426e41417975d817fb5464ea83e2cf2f5aba Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Tue, 16 Aug 2022 17:24:58 +0200 Subject: [PATCH 06/21] Fix docker zombie process issue (#2289) * Fix docker zombie process issue * Init everywhere Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 2f5c03f794..9425ceb536 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -47,7 +47,9 @@ jobs: build-neon: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init strategy: fail-fast: false matrix: @@ -202,7 +204,9 @@ jobs: pg_regress-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] strategy: fail-fast: false @@ -230,7 +234,9 @@ jobs: other-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] strategy: fail-fast: false @@ -261,7 +267,9 @@ jobs: benchmarks: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: @@ -292,7 +300,9 @@ jobs: coverage-report: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ other-tests, pg_regress-tests ] strategy: fail-fast: false @@ -368,7 +378,9 @@ jobs: trigger-e2e-tests: runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init needs: [ build-neon ] steps: - name: Set PR's status to pending and request a remote CI test From 648e8bbefeadcf8c81d12f3598cca0794c57a284 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 16 Aug 2022 18:49:22 +0300 Subject: [PATCH 07/21] Fix 1.63 clippy lints (#2282) --- libs/postgres_ffi/src/relfile_utils.rs | 2 +- libs/postgres_ffi/src/xlog_utils.rs | 4 ++-- libs/utils/src/bin_ser.rs | 4 ++-- libs/utils/src/http/request.rs | 10 ++++------ libs/utils/src/lsn.rs | 2 +- libs/utils/src/postgres_backend.rs | 2 +- libs/utils/src/pq_proto.rs | 2 +- libs/utils/src/seqwait.rs | 2 +- libs/utils/tests/bin_ser_test.rs | 2 +- libs/utils/tests/ssl_test.rs | 3 +++ pageserver/src/http/routes.rs | 2 +- pageserver/src/layered_repository/disk_btree.rs | 2 +- proxy/src/auth/backend.rs | 2 +- proxy/src/compute.rs | 13 +++++++++++-- safekeeper/src/control_file_upgrade.rs | 4 ++-- safekeeper/src/safekeeper.rs | 2 +- safekeeper/src/send_wal.rs | 2 +- 17 files changed, 35 insertions(+), 25 deletions(-) diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index 94498ee9a9..cc9d6470c0 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -5,7 +5,7 @@ use crate::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; -#[derive(Debug, Clone, thiserror::Error, PartialEq)] +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] InvalidForkName, diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 29b00c8d36..956f53ce85 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -80,12 +80,12 @@ pub fn XLogSegNoOffsetToRecPtr( #[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { - return format!( + format!( "{:>08X}{:>08X}{:>08X}", tli, logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) - ); + ) } #[allow(non_snake_case)] diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 70f54ea02f..42b45eeea0 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -265,7 +265,7 @@ mod tests { use serde::{Deserialize, Serialize}; use std::io::Cursor; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, b: u32, @@ -286,7 +286,7 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, pub blockpos: u32, diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 8e3d357397..4984d695fd 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -10,12 +10,10 @@ pub fn get_request_param<'a>( ) -> Result<&'a str, ApiError> { match request.param(param_name) { Some(arg) => Ok(arg), - None => { - return Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name - ))) - } + None => Err(ApiError::BadRequest(format!( + "no {} specified in path param", + param_name + ))), } } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 3dab2a625c..1090f4c679 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -18,7 +18,7 @@ pub const XLOG_BLCKSZ: u32 = 8192; pub struct Lsn(pub u64); /// We tried to parse an LSN from a string, but failed -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("LsnParseError")] pub struct LsnParseError; diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 79dca96fcf..4d873bd5ac 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -50,7 +50,7 @@ pub trait Handler { /// PostgresBackend protocol state. /// XXX: The order of the constructors matters. -#[derive(Clone, Copy, PartialEq, PartialOrd)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] pub enum ProtoState { Initialization, Encrypted, diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index 3dcae4d0af..3f14acd50d 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -930,7 +930,7 @@ impl<'a> BeMessage<'a> { // Neon extension of postgres replication protocol // See NEON_STATUS_UPDATE_TAG_BYTE -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct ReplicationFeedback { // Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index bc32f51b13..a531975d60 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -9,7 +9,7 @@ use std::sync::Mutex; use std::time::Duration; /// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached diff --git a/libs/utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs index f357837a55..b995b61b78 100644 --- a/libs/utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -4,7 +4,7 @@ use serde::Deserialize; use std::io::Read; use utils::bin_ser::LeSer; -#[derive(Debug, PartialEq, Deserialize)] +#[derive(Debug, PartialEq, Eq, Deserialize)] pub struct HeaderData { magic: u16, info: u16, diff --git a/libs/utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs index 907ef98aec..248400c2c1 100644 --- a/libs/utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -30,6 +30,9 @@ static CERT: Lazy = Lazy::new(|| { }); #[test] +// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274), +// we resize the vector so doing some modifications after all +#[allow(clippy::read_zero_byte_vec)] fn ssl() { let (mut client_sock, server_sock) = make_tcp_pair(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1b1b4f99cb..81ecdb7404 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -167,7 +167,7 @@ fn local_timeline_info_from_repo_timeline( ) -> anyhow::Result { match repo_timeline { RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline( - &*timeline, + timeline, include_non_incremental_logical_size, include_non_incremental_physical_size, ), diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index dc8d7a2ad3..c130a42a8e 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -209,7 +209,7 @@ where reader: R, } -#[derive(Clone, Copy, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum VisitDirection { Forwards, Backwards, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index b10ede8d5e..bb7e7ef67b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -86,7 +86,7 @@ impl From for tokio_postgres::Config { /// * However, when we substitute `T` with [`ClientCredentials`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BackendType { /// Legacy Cloud API (V1) + link auth. LegacyConsole(T), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 896ef3588d..3bad36661b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -65,8 +65,17 @@ impl NodeInfo { // require for our business. let mut connection_error = None; let ports = self.config.get_ports(); - for (i, host) in self.config.get_hosts().iter().enumerate() { - let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432); + let hosts = self.config.get_hosts(); + // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array + if ports.len() > 1 && ports.len() != hosts.len() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config), + )); + } + + for (i, host) in hosts.iter().enumerate() { + let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); let host = match host { Host::Tcp(host) => host.as_str(), Host::Unix(_) => continue, // unix sockets are not welcome here diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 5e749796dd..91d2f61c10 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -40,7 +40,7 @@ struct SafeKeeperStateV1 { wal_start_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV2 { /// Postgres server version pub pg_version: u32, @@ -70,7 +70,7 @@ pub struct SafeKeeperStateV2 { pub wal_start_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV3 { /// Postgres server version pub pg_version: u32, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index a9373cb584..88747f14e5 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -127,7 +127,7 @@ impl AcceptorState { /// Information about Postgres. Safekeeper gets it once and then verifies /// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfo { /// Postgres server version pub pg_version: u32, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 7439d6a8f6..4a9c56859f 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -36,7 +36,7 @@ const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; /// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct HotStandbyFeedback { pub ts: TimestampTz, pub xmin: FullTransactionId, From b21f7382ccdc5512eb11f3467fe0f5468e0fa543 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 15 Aug 2022 23:16:35 +0300 Subject: [PATCH 08/21] split out timeline metrics, track layer map loading and size calculation --- pageserver/src/layered_repository/timeline.rs | 165 +++++++++++------- 1 file changed, 98 insertions(+), 67 deletions(-) diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index 2d396024a0..e27619cc83 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -4,6 +4,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; use fail::fail_point; use itertools::Itertools; +use metrics::core::{AtomicU64, GenericCounter}; use once_cell::sync::Lazy; use tracing::*; @@ -223,6 +224,70 @@ impl From for RepositoryTimeline { } } +struct TimelineMetrics { + pub reconstruct_time_histo: Histogram, + pub materialized_page_cache_hit_counter: GenericCounter, + pub flush_time_histo: Histogram, + pub compact_time_histo: Histogram, + pub create_images_time_histo: Histogram, + pub init_logical_size_histo: Histogram, + pub load_layer_map_histo: Histogram, + pub last_record_gauge: IntGauge, + pub wait_lsn_time_histo: Histogram, + pub current_physical_size_gauge: UIntGauge, +} + +impl TimelineMetrics { + fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) + .unwrap(); + let init_logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) + .unwrap(); + let load_layer_map_histo = STORAGE_TIME + .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + + TimelineMetrics { + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + init_logical_size_histo, + load_layer_map_histo, + last_record_gauge, + wait_lsn_time_histo, + current_physical_size_gauge, + } + } +} + pub struct LayeredTimeline { conf: &'static PageServerConf, tenant_conf: Arc>, @@ -269,14 +334,7 @@ pub struct LayeredTimeline { ancestor_lsn: Lsn, // Metrics - reconstruct_time_histo: Histogram, - materialized_page_cache_hit_counter: IntCounter, - flush_time_histo: Histogram, - compact_time_histo: Histogram, - create_images_time_histo: Histogram, - last_record_gauge: IntGauge, - wait_lsn_time_histo: Histogram, - current_physical_size_gauge: UIntGauge, + metrics: TimelineMetrics, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_layers: AtomicBool, @@ -426,7 +484,7 @@ impl Timeline for LayeredTimeline { "wait_lsn called by WAL receiver thread" ); - self.wait_lsn_time_histo.observe_closure_duration( + self.metrics.wait_lsn_time_histo.observe_closure_duration( || self.last_record_lsn .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) .with_context(|| { @@ -468,7 +526,8 @@ impl Timeline for LayeredTimeline { self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; - self.reconstruct_time_histo + self.metrics + .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } @@ -530,7 +589,7 @@ impl Timeline for LayeredTimeline { } fn get_physical_size(&self) -> u64 { - self.current_physical_size_gauge.get() + self.metrics.current_physical_size_gauge.get() } fn get_physical_size_non_incremental(&self) -> anyhow::Result { @@ -604,43 +663,6 @@ impl LayeredTimeline { walredo_mgr: Arc, upload_layers: bool, ) -> LayeredTimeline { - let reconstruct_time_histo = RECONSTRUCT_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "layer flush", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "compact", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&[ - "create images", - &tenant_id.to_string(), - &timeline_id.to_string(), - ]) - .unwrap(); - let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE - .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()]) - .unwrap(); - let mut result = LayeredTimeline { conf, tenant_conf, @@ -663,14 +685,7 @@ impl LayeredTimeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - reconstruct_time_histo, - materialized_page_cache_hit_counter, - flush_time_histo, - compact_time_histo, - create_images_time_histo, - last_record_gauge, - wait_lsn_time_histo, - current_physical_size_gauge, + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), upload_layers: AtomicBool::new(upload_layers), @@ -706,6 +721,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); let mut num_layers = 0; + let timer = self.metrics.load_layer_map_histo.start_timer(); + // Scan timeline directory and create ImageFileName and DeltaFilename // structs representing all files on disk let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); @@ -777,7 +794,11 @@ impl LayeredTimeline { "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size ); - self.current_physical_size_gauge.set(total_physical_size); + self.metrics + .current_physical_size_gauge + .set(total_physical_size); + + timer.stop_and_record(); Ok(()) } @@ -808,12 +829,16 @@ impl LayeredTimeline { } } + let timer = self.metrics.init_logical_size_histo.start_timer(); + // Have to calculate it the hard way let last_lsn = self.get_last_record_lsn(); let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?; self.current_logical_size .store(logical_size as isize, AtomicOrdering::SeqCst); debug!("calculated logical size the hard way: {}", logical_size); + + timer.stop_and_record(); Ok(()) } @@ -878,7 +903,7 @@ impl LayeredTimeline { ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. if cont_lsn == cached_lsn + 1 { - self.materialized_page_cache_hit_counter.inc_by(1); + self.metrics.materialized_page_cache_hit_counter.inc_by(1); return Ok(()); } if prev_lsn <= cont_lsn { @@ -1074,7 +1099,7 @@ impl LayeredTimeline { fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); - self.last_record_gauge.set(new_lsn.0 as i64); + self.metrics.last_record_gauge.set(new_lsn.0 as i64); self.last_record_lsn.advance(new_lsn); } @@ -1178,7 +1203,7 @@ impl LayeredTimeline { } }; - let timer = self.flush_time_histo.start_timer(); + let timer = self.metrics.flush_time_histo.start_timer(); loop { let layers = self.layers.read().unwrap(); @@ -1349,7 +1374,7 @@ impl LayeredTimeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.current_physical_size_gauge.add(sz); + self.metrics.current_physical_size_gauge.add(sz); // update metrics NUM_PERSISTENT_FILES_CREATED.inc_by(1); PERSISTENT_BYTES_WRITTEN.inc_by(sz); @@ -1418,7 +1443,7 @@ impl LayeredTimeline { } // 3. Compact - let timer = self.compact_time_histo.start_timer(); + let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size)?; timer.stop_and_record(); } @@ -1494,7 +1519,7 @@ impl LayeredTimeline { lsn: Lsn, force: bool, ) -> Result> { - let timer = self.create_images_time_histo.start_timer(); + let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); let mut layer_paths_to_upload = HashSet::new(); for partition in partitioning.parts.iter() { @@ -1538,7 +1563,8 @@ impl LayeredTimeline { let mut layers = self.layers.write().unwrap(); for l in image_layers { - self.current_physical_size_gauge + self.metrics + .current_physical_size_gauge .add(l.path().metadata()?.len()); layers.insert_historic(Arc::new(l)); } @@ -1788,7 +1814,8 @@ impl LayeredTimeline { let new_delta_path = l.path(); // update the timeline's physical size - self.current_physical_size_gauge + self.metrics + .current_physical_size_gauge .add(new_delta_path.metadata()?.len()); new_layer_paths.insert(new_delta_path); @@ -1801,7 +1828,9 @@ impl LayeredTimeline { drop(all_keys_iter); for l in deltas_to_compact { if let Some(path) = l.local_path() { - self.current_physical_size_gauge.sub(path.metadata()?.len()); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); layer_paths_do_delete.insert(path); } l.delete()?; @@ -2058,7 +2087,9 @@ impl LayeredTimeline { let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); for doomed_layer in layers_to_remove { if let Some(path) = doomed_layer.local_path() { - self.current_physical_size_gauge.sub(path.metadata()?.len()); + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); layer_paths_to_delete.insert(path); } doomed_layer.delete()?; From d5ec84b87bc5d11c4e77df77a515356ef33a4e32 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 16 Aug 2022 01:03:17 +0300 Subject: [PATCH 09/21] reset rust cache for clippy run to avoid an ICE additionally remove trailing whitespaces --- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/codestyle.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9425ceb536..884187cec2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -121,8 +121,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} - v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- - name: Cache postgres build id: cache_pg @@ -688,4 +688,4 @@ jobs: run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s \ No newline at end of file + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml index aa37167a19..6f13a38dea 100644 --- a/.github/workflows/codestyle.yml +++ b/.github/workflows/codestyle.yml @@ -101,7 +101,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} + key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh From e94a5ce3606ecd6b8fddf85c087d33d4e90c11a4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 29 Jul 2022 16:43:30 +0300 Subject: [PATCH 10/21] Rename pg_control_ffi.h to bindgen_deps.h, for clarity. The pg_control_ffi.h name implies that it only includes stuff related to pg_control.h. That's mostly true currently, but really the point of the file is to include everything that we need to generate Rust definitions from. --- libs/postgres_ffi/{pg_control_ffi.h => bindgen_deps.h} | 0 libs/postgres_ffi/build.rs | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename libs/postgres_ffi/{pg_control_ffi.h => bindgen_deps.h} (100%) diff --git a/libs/postgres_ffi/pg_control_ffi.h b/libs/postgres_ffi/bindgen_deps.h similarity index 100% rename from libs/postgres_ffi/pg_control_ffi.h rename to libs/postgres_ffi/bindgen_deps.h diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 7db2c20e34..69b2711c22 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -44,7 +44,7 @@ impl ParseCallbacks for PostgresFfiCallbacks { fn main() { // Tell cargo to invalidate the built crate whenever the wrapper changes - println!("cargo:rerun-if-changed=pg_control_ffi.h"); + println!("cargo:rerun-if-changed=bindgen_deps.h"); // Finding the location of C headers for the Postgres server: // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` @@ -88,9 +88,9 @@ fn main() { // the resulting bindings. let bindings = bindgen::Builder::default() // - // All the needed PostgreSQL headers are included from 'pg_control_ffi.h' + // All the needed PostgreSQL headers are included from 'bindgen_deps.h' // - .header("pg_control_ffi.h") + .header("bindgen_deps.h") // // Tell cargo to invalidate the built crate whenever any of the // included header files changed. From 3414feae037e954a4630309114f196a84c1198a8 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 17 Aug 2022 08:17:09 -0400 Subject: [PATCH 11/21] Make local mypy behave like CI mypy (#2291) --- setup.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.cfg b/setup.cfg index d1a2f9a359..7f8c45c8c3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,10 @@ exclude = ^vendor/ # some tests don't typecheck when this flag is set check_untyped_defs = false +# Help mypy find imports when running against list of individual files. +# Without this line it would behave differently when executed on the entire project. +mypy_path = $MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner + disallow_incomplete_defs = false disallow_untyped_calls = false disallow_untyped_decorators = false From e9a3499e87c2f87632849b501282387e2a9dcbeb Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 17 Aug 2022 08:17:35 -0400 Subject: [PATCH 12/21] Fix flaky pageserver restarts in tests (#2261) --- control_plane/src/safekeeper.rs | 45 +++++++++----------------- control_plane/src/storage.rs | 44 ++++++++----------------- test_runner/batch_others/test_setup.py | 17 ++++++++++ 3 files changed, 46 insertions(+), 60 deletions(-) create mode 100644 test_runner/batch_others/test_setup.py diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 0cae479d71..3fda856e13 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,5 +1,4 @@ use std::io::Write; -use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; use std::sync::Arc; @@ -241,37 +240,23 @@ impl SafekeeperNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if safekeeper flushes a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nSafekeeper connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port - match self.check_status() { - Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nSafekeeper status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; + if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index d2742e84bb..31858278d3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; -use std::net::TcpStream; use std::num::NonZeroU64; use std::path::PathBuf; use std::process::Command; @@ -312,38 +311,23 @@ impl PageServerNode { ), } - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if pageserver checkpoints a lot of data - let mut tcp_stopped = false; + // Wait until process is gone for i in 0..600 { - if !tcp_stopped { - if let Err(err) = TcpStream::connect(&address) { - tcp_stopped = true; - if err.kind() != io::ErrorKind::ConnectionRefused { - eprintln!("\nPageserver connection failed with error: {err}"); - } + let signal = None; // Send no signal, just get the error code + match kill(pid, signal) { + Ok(_) => (), // Process exists, keep waiting + Err(Errno::ESRCH) => { + // Process not found, we're done + println!("done!"); + return Ok(()); } - } - if tcp_stopped { - // Also check status on the HTTP port + Err(err) => bail!( + "Failed to send signal to pageserver with pid {}: {}", + pid, + err.desc() + ), + }; - match self.check_status() { - Err(PageserverHttpError::Transport(err)) if err.is_connect() => { - println!("done!"); - return Ok(()); - } - Err(err) => { - eprintln!("\nPageserver status check failed with error: {err}"); - return Ok(()); - } - Ok(()) => { - // keep waiting - } - } - } if i % 10 == 0 { print!("."); io::stdout().flush().unwrap(); diff --git a/test_runner/batch_others/test_setup.py b/test_runner/batch_others/test_setup.py new file mode 100644 index 0000000000..3d1471621b --- /dev/null +++ b/test_runner/batch_others/test_setup.py @@ -0,0 +1,17 @@ +"""Tests for the code in test fixtures""" + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that pageserver and safekeeper can restart quickly. +# This is a regression test, see https://github.com/neondatabase/neon/issues/2247 +def test_fixture_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + for i in range(3): + env.pageserver.stop() + env.pageserver.start() + + for i in range(3): + env.safekeepers[0].stop() + env.safekeepers[0].start() From 3b819ee159d9f363c2cb354423f14c808399ab56 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 17 Aug 2022 17:51:53 +0300 Subject: [PATCH 13/21] Remove extra type aliases (#2280) --- pageserver/src/http/routes.rs | 7 +++---- pageserver/src/layered_repository.rs | 4 +++- pageserver/src/lib.rs | 4 ---- pageserver/src/repository.rs | 5 ++--- pageserver/src/tenant_mgr.rs | 19 +++++++++---------- pageserver/src/timelines.rs | 11 +++++++---- .../src/walreceiver/connection_manager.rs | 14 +++++--------- 7 files changed, 29 insertions(+), 35 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 81ecdb7404..1d0adec63d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,14 +11,13 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, }; -use crate::layered_repository::metadata::TimelineMetadata; +use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline}; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{LocalTimelineState, RepositoryTimeline}; use crate::repository::{Repository, Timeline}; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; -use crate::TimelineImpl; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ auth::JwtAuth, @@ -86,7 +85,7 @@ fn get_config(request: &Request) -> &'static PageServerConf { // Helper functions to construct a LocalTimelineInfo struct for a timeline fn local_timeline_info_from_loaded_timeline( - timeline: &TimelineImpl, + timeline: &LayeredTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { @@ -161,7 +160,7 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo } fn local_timeline_info_from_repo_timeline( - repo_timeline: &RepositoryTimeline, + repo_timeline: &RepositoryTimeline, include_non_incremental_logical_size: bool, include_non_incremental_physical_size: bool, ) -> anyhow::Result { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 8e75cbd4a2..6bf2e71852 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -59,7 +59,9 @@ mod storage_layer; mod timeline; use storage_layer::Layer; -use timeline::{LayeredTimeline, LayeredTimelineEntry}; +use timeline::LayeredTimelineEntry; + +pub use timeline::LayeredTimeline; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 140260e0d0..47fd8a84cf 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -28,7 +28,6 @@ use tracing::info; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; -use layered_repository::LayeredRepository; use pgdatadir_mapping::DatadirTimeline; /// Current storage format version @@ -62,9 +61,6 @@ pub enum CheckpointConfig { Forced, } -pub type RepositoryImpl = LayeredRepository; -pub type TimelineImpl = ::Timeline; - pub fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint thread. This prevents new connections from // being accepted. diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index a1a08b11d5..d09b01437c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -412,7 +412,6 @@ pub mod repo_harness { use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; - use crate::RepositoryImpl; use crate::{ config::PageServerConf, layered_repository::LayeredRepository, @@ -508,11 +507,11 @@ pub mod repo_harness { }) } - pub fn load(&self) -> RepositoryImpl { + pub fn load(&self) -> LayeredRepository { self.try_load().expect("failed to load test repo") } - pub fn try_load(&self) -> Result { + pub fn try_load(&self) -> Result { let walredo_mgr = Arc::new(TestRedoManager); let repo = LayeredRepository::new( diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 5a5cea9a4b..d90cd7371a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -3,7 +3,7 @@ use crate::config::PageServerConf; use crate::http::models::TenantInfo; -use crate::layered_repository::{load_metadata, LayeredRepository}; +use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline}; use crate::repository::Repository; use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; @@ -12,7 +12,6 @@ use crate::thread_mgr::ThreadKind; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; -use crate::{RepositoryImpl, TimelineImpl}; use anyhow::Context; use serde::{Deserialize, Serialize}; use std::collections::hash_map::Entry; @@ -96,13 +95,13 @@ mod tenants_state { struct Tenant { state: TenantState, /// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk. - repo: Arc, + repo: Arc, /// Timelines, located locally in the pageserver's datadir. /// Timelines can entirely be removed entirely by the `detach` operation only. /// /// Local timelines have more metadata that's loaded into memory, /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`]. - local_timelines: HashMap::Timeline>>, + local_timelines: HashMap>, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -179,7 +178,7 @@ pub enum LocalTimelineUpdate { }, Attach { id: ZTenantTimelineId, - datadir: Arc<::Timeline>, + datadir: Arc, }, } @@ -369,7 +368,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow: Ok(()) } -pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { +pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result> { let m = tenants_state::read_tenants(); let tenant = m .get(&tenant_id) @@ -383,7 +382,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) @@ -488,9 +487,9 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any } fn load_local_timeline( - repo: &RepositoryImpl, + repo: &LayeredRepository, timeline_id: ZTimelineId, -) -> anyhow::Result> { +) -> anyhow::Result> { let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| { format!("Inmem timeline {timeline_id} not found in tenant's repository") })?; @@ -634,7 +633,7 @@ fn load_local_repo( conf: &'static PageServerConf, tenant_id: ZTenantId, remote_index: &RemoteIndex, -) -> anyhow::Result> { +) -> anyhow::Result> { let mut m = tenants_state::write_tenants(); let tenant = m.entry(tenant_id).or_insert_with(|| { // Set up a WAL redo manager, for applying WAL records. diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index 6002e8b2d9..c5b938c5fe 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -21,10 +21,13 @@ use utils::{ use crate::tenant_mgr; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, - tenant_config::TenantConfOpt, RepositoryImpl, TimelineImpl, + tenant_config::TenantConfOpt, }; use crate::{import_datadir, LOG_FILE_NAME}; -use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; +use crate::{ + layered_repository::{LayeredRepository, LayeredTimeline}, + walredo::WalRedoManager, +}; use crate::{repository::Timeline, CheckpointConfig}; #[derive(Debug, Clone, Copy)] @@ -73,7 +76,7 @@ pub fn create_repo( tenant_conf: TenantConfOpt, tenant_id: ZTenantId, create_repo: CreateRepo, -) -> Result> { +) -> Result> { let (wal_redo_manager, remote_index) = match create_repo { CreateRepo::Real { wal_redo_manager, @@ -223,7 +226,7 @@ pub(crate) fn create_timeline( new_timeline_id: Option, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, -) -> Result)>> { +) -> Result)>> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 2722bc7320..0f11a2197a 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -16,6 +16,7 @@ use std::{ time::Duration, }; +use crate::{layered_repository::LayeredTimeline, repository::Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use etcd_broker::{ @@ -25,12 +26,7 @@ use etcd_broker::{ use tokio::select; use tracing::*; -use crate::{ - exponential_backoff, - repository::{Repository, Timeline}, - DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, -}; -use crate::{RepositoryImpl, TimelineImpl}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use utils::{ lsn::Lsn, zid::{NodeId, ZTenantTimelineId}, @@ -43,7 +39,7 @@ pub(super) fn spawn_connection_manager_task( id: ZTenantTimelineId, broker_loop_prefix: String, mut client: Client, - local_timeline: Arc, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, @@ -242,7 +238,7 @@ async fn subscribe_for_timeline_updates( struct WalreceiverState { id: ZTenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. - local_timeline: Arc, + local_timeline: Arc, /// The timeout on the connection to safekeeper for WAL streaming. wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. @@ -291,7 +287,7 @@ struct EtcdSkTimeline { impl WalreceiverState { fn new( id: ZTenantTimelineId, - local_timeline: Arc<::Timeline>, + local_timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, From 262cdf83442eea512a0a0a812017c73f5ce1df9b Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Wed, 17 Aug 2022 18:02:03 +0200 Subject: [PATCH 14/21] Update cachepot endpoint (#2290) * Update cachepot endpoint * Update dockerfile & remove env * Update image building process * Cannot use metadata endpoint for this * Update workflow * Conform to kaniko syntax * Update syntax * Update approach * Update dockerfiles * Force update * Update dockerfiles * Update dockerfile * Cleanup dockerfiles * Update s3 test location * Revert s3 experiment * Add more debug * Specify aws region * Remove debug, add prefix * Remove one more debug Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 16 +++++------- Dockerfile | 39 ++++++++++++++-------------- Dockerfile.compute-tools | 17 +++++++----- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 884187cec2..9b46d36c92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -436,11 +436,10 @@ jobs: neon-image: # force building for all 3 images - if: needs.dockerfile-check.outputs.value != 'true' + if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug - environment: dev steps: - name: Checkout @@ -452,15 +451,14 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build console + - name: Kaniko build neon run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID compute-tools-image: - if: needs.dockerfile-check.outputs.value != 'true' + if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug - environment: dev steps: - name: Checkout @@ -469,15 +467,14 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build console + - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID compute-node-image: - if: needs.dockerfile-check.outputs.value != 'true' + if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug - environment: dev steps: - name: Checkout @@ -489,7 +486,7 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build console + - name: Kaniko build compute node working-directory: ./vendor/postgres/ run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID @@ -512,7 +509,6 @@ jobs: runs-on: dev needs: [ promote-images, tag ] container: golang:1.19-bullseye - environment: dev steps: - name: Install Crane & ECR helper diff --git a/Dockerfile b/Dockerfile index 6f017ac5d4..1afaa41fb4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,6 @@ # Build Postgres -FROM neondatabase/rust:1.58 AS pg-build -WORKDIR /pg - -USER root +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build +WORKDIR /home/nonroot COPY vendor/postgres vendor/postgres COPY Makefile Makefile @@ -11,27 +9,30 @@ ENV BUILD_TYPE release RUN set -e \ && mold -run make -j $(nproc) -s postgres \ && rm -rf tmp_install/build \ - && tar -C tmp_install -czf /postgres_install.tar.gz . + && tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz . # Build zenith binaries -FROM neondatabase/rust:1.58 AS build +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build +WORKDIR /home/nonroot ARG GIT_VERSION=local # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build ARG RUSTC_WRAPPER=cachepot -ARG CACHEPOT_BUCKET=zenith-rust-cachepot -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY -COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +COPY --from=pg-build /home/nonroot/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ - && sudo -E "PATH=$PATH" mold -run cargo build --release \ + && mold -run cargo build --release \ && cachepot -s # Build final image @@ -40,8 +41,8 @@ FROM debian:bullseye-slim WORKDIR /data RUN set -e \ - && apt-get update \ - && apt-get install -y \ + && apt update \ + && apt install -y \ libreadline-dev \ libseccomp-dev \ openssl \ @@ -50,12 +51,12 @@ RUN set -e \ && useradd -d /data zenith \ && chown -R zenith:zenith /data -COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy /usr/local/bin -COPY --from=pg-build /pg/tmp_install/ /usr/local/ -COPY --from=pg-build /postgres_install.tar.gz /data/ +COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ +COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ COPY docker-entrypoint.sh /docker-entrypoint.sh diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 76cbc2ac30..05393021c2 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,22 +1,25 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -FROM neondatabase/rust:1.58 AS rust-build +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build +WORKDIR /home/nonroot # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. # cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. ARG RUSTC_WRAPPER=cachepot -ARG CACHEPOT_BUCKET=zenith-rust-cachepot -ARG AWS_ACCESS_KEY_ID -ARG AWS_SECRET_ACCESS_KEY +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY COPY . . RUN set -e \ - && sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \ + && mold -run cargo build -p compute_tools --release \ && cachepot -s # Final image that only has one binary -FROM debian:buster-slim +FROM debian:bullseye-slim -COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl From dc102197df0469da544463253e636395d1d33789 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 17 Aug 2022 17:16:26 +0100 Subject: [PATCH 15/21] workflows/benchmarking: increase timeout (#2294) --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index a6b2ca34e8..8080d6b7db 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -106,7 +106,7 @@ jobs: mkdir -p perf-report-staging # Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file, # it's important for test_perf_pgbench.py::test_pgbench_remote_* tests - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600 + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 5400 - name: Submit result env: From 67e091c906a2863e32a8599d4f0aca90fc756b74 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 17 Aug 2022 23:24:47 +0300 Subject: [PATCH 16/21] Rework `init` in pageserver CLI (#2272) * Do not create initial tenant and timeline (adjust Python tests for that) * Rework config handling during init, add --update-config to manage local config updates --- Dockerfile | 3 - control_plane/src/local_env.rs | 10 +- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage.rs | 204 +++++++++--------- docker-entrypoint.sh | 24 --- neon_local/src/main.rs | 62 +++--- pageserver/src/bin/pageserver.rs | 180 +++++++++------- pageserver/src/tenant_mgr.rs | 7 +- pageserver/src/timelines.rs | 64 +----- pageserver/src/walredo.rs | 18 -- .../batch_others/test_pageserver_api.py | 45 +++- .../batch_others/test_tenant_relocation.py | 26 +-- 12 files changed, 285 insertions(+), 360 deletions(-) delete mode 100755 docker-entrypoint.sh diff --git a/Dockerfile b/Dockerfile index 1afaa41fb4..17aa0025e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,10 +58,7 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ -COPY docker-entrypoint.sh /docker-entrypoint.sh - VOLUME ["/data"] USER zenith EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] CMD ["pageserver"] diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e0b409f32d..75e552f6cc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use crate::safekeeper::SafekeeperNode; // This data structures represents neon_local CLI config // // It is deserialized from the .neon/config file, or the config file passed -// to 'zenith init --config=' option. See control_plane/simple.conf for +// to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // #[serde_as] @@ -320,7 +320,7 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'zenith init' first", + "Zenith config is not found in {}. You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } @@ -337,12 +337,12 @@ impl LocalEnv { } pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'zenith init --config=' + // Currently, the user first passes a config file with 'neon_local init --config=' // We read that in, in `create_config`, and fill any missing defaults. Then it's saved // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. let mut conf_content = r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line +# and safekeeeper node. It is read by the 'neon_local' command-line # utility. "# .to_string(); @@ -382,7 +382,7 @@ impl LocalEnv { } // - // Initialize a new Zenith repository + // Initialize a new Neon repository // pub fn init(&mut self) -> anyhow::Result<()> { // check if config already exists diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 3fda856e13..652736058a 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -51,7 +51,7 @@ impl ResponseErrorMessageExt for Response { Err(SafekeeperHttpError::Response( match self.json::() { Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {url}.", status.as_u16()), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }, )) } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 31858278d3..aab29628e3 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; use std::num::NonZeroU64; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::process::Command; use std::time::Duration; use std::{io, result, thread}; @@ -102,23 +102,19 @@ impl PageServerNode { /// Construct libpq connection string for connecting to the pageserver. fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config { - format!("postgresql://no_user:{}@{}/no_db", password, listen_addr) + format!("postgresql://no_user:{password}@{listen_addr}/no_db") .parse() .unwrap() } - pub fn init( + pub fn initialize( &self, create_tenant: Option, initial_timeline_id: Option, config_overrides: &[&str], ) -> anyhow::Result { - let mut cmd = Command::new(self.env.pageserver_bin()?); - let id = format!("id={}", self.env.pageserver.id); - // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. - let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); @@ -138,67 +134,52 @@ impl PageServerNode { .collect::>() .join(",") ); - let mut args = Vec::with_capacity(20); - - args.push("--init"); - args.extend(["-D", &base_data_dir_param]); - args.extend(["-c", &pg_distrib_dir_param]); - args.extend(["-c", &authg_type_param]); - args.extend(["-c", &listen_http_addr_param]); - args.extend(["-c", &listen_pg_addr_param]); - args.extend(["-c", &broker_endpoints_param]); - args.extend(["-c", &id]); - let broker_etcd_prefix_param = self .env .etcd_broker .broker_etcd_prefix .as_ref() .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); - if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { - args.extend(["-c", broker_etcd_prefix_param]); - } - for config_override in config_overrides { - args.extend(["-c", config_override]); + let mut init_config_overrides = config_overrides.to_vec(); + init_config_overrides.push(&id); + init_config_overrides.push(&pg_distrib_dir_param); + init_config_overrides.push(&authg_type_param); + init_config_overrides.push(&listen_http_addr_param); + init_config_overrides.push(&listen_pg_addr_param); + init_config_overrides.push(&broker_endpoints_param); + + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + init_config_overrides.push(broker_etcd_prefix_param); } if self.env.pageserver.auth_type != AuthType::Trust { - args.extend([ - "-c", - "auth_validation_public_key_path='auth_public_key.pem'", - ]); + init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); } - let create_tenant = create_tenant.map(|id| id.to_string()); - if let Some(tenant_id) = create_tenant.as_deref() { - args.extend(["--create-tenant", tenant_id]) + self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?; + let init_result = self + .try_init_timeline(create_tenant, initial_timeline_id) + .context("Failed to create initial tenant and timeline for pageserver"); + match &init_result { + Ok(initial_timeline_id) => { + println!("Successfully initialized timeline {initial_timeline_id}") + } + Err(e) => eprintln!("{e:#}"), } + self.stop(false)?; + init_result + } - let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - let initial_timeline_id_string = initial_timeline_id.to_string(); - args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - - let cmd_with_args = cmd.args(args); - let init_output = fill_rust_env_vars(cmd_with_args) - .output() - .with_context(|| { - format!("failed to init pageserver with command {:?}", cmd_with_args) - })?; - - if !init_output.status.success() { - bail!( - "init invocation failed, {}\nStdout: {}\nStderr: {}", - init_output.status, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr) - ); - } - - // echo the captured output of the init command - println!("{}", String::from_utf8_lossy(&init_output.stdout)); - - Ok(initial_timeline_id) + fn try_init_timeline( + &self, + new_tenant_id: Option, + new_timeline_id: Option, + ) -> anyhow::Result { + let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; + let initial_timeline_info = + self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?; + Ok(initial_timeline_info.timeline_id) } pub fn repo_path(&self) -> PathBuf { @@ -210,15 +191,35 @@ impl PageServerNode { } pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - print!( + self.start_node(config_overrides, &self.repo_path(), false) + } + + fn start_node( + &self, + config_overrides: &[&str], + datadir: &Path, + update_config: bool, + ) -> anyhow::Result<()> { + println!( "Starting pageserver at '{}' in '{}'", connection_address(&self.pg_connection_config), - self.repo_path().display() + datadir.display() ); - io::stdout().flush().unwrap(); + io::stdout().flush()?; - let repo_path = self.repo_path(); - let mut args = vec!["-D", repo_path.to_str().unwrap()]; + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!( + "Datadir path '{}' cannot be represented as a unicode string", + datadir.display() + ) + })?, + ]; + + if update_config { + args.push("--update-config"); + } for config_override in config_overrides { args.extend(["-c", config_override]); @@ -230,8 +231,8 @@ impl PageServerNode { if !filled_cmd.status()?.success() { bail!( - "Pageserver failed to start. See '{}' for details.", - self.repo_path().join("pageserver.log").display() + "Pageserver failed to start. See console output and '{}' for details.", + datadir.join("pageserver.log").display() ); } @@ -240,7 +241,7 @@ impl PageServerNode { const RETRIES: i8 = 15; for retries in 1..RETRIES { match self.check_status() { - Ok(_) => { + Ok(()) => { println!("\nPageserver started"); return Ok(()); } @@ -254,21 +255,18 @@ impl PageServerNode { if retries == 5 { println!() // put a line break after dots for second message } - println!( - "Pageserver not responding yet, err {} retrying ({})...", - err, retries - ); + println!("Pageserver not responding yet, err {err} retrying ({retries})..."); } } PageserverHttpError::Response(msg) => { - bail!("pageserver failed to start: {} ", msg) + bail!("pageserver failed to start: {msg} ") } } thread::sleep(Duration::from_secs(1)); } } } - bail!("pageserver failed to start in {} seconds", RETRIES); + bail!("pageserver failed to start in {RETRIES} seconds"); } /// @@ -298,15 +296,11 @@ impl PageServerNode { match kill(pid, sig) { Ok(_) => (), Err(Errno::ESRCH) => { - println!( - "Pageserver with pid {} does not exist, but a PID file was found", - pid - ); + println!("Pageserver with pid {pid} does not exist, but a PID file was found"); return Ok(()); } Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, + "Failed to send signal to pageserver with pid {pid}: {}", err.desc() ), } @@ -335,13 +329,13 @@ impl PageServerNode { thread::sleep(Duration::from_millis(100)); } - bail!("Failed to stop pageserver with pid {}", pid); + bail!("Failed to stop pageserver with pid {pid}"); } pub fn page_server_psql(&self, sql: &str) -> Vec { let mut client = self.pg_connection_config.connect(NoTls).unwrap(); - println!("Pageserver query: '{}'", sql); + println!("Pageserver query: '{sql}'"); client.simple_query(sql).unwrap() } @@ -376,9 +370,8 @@ impl PageServerNode { &self, new_tenant_id: Option, settings: HashMap<&str, &str>, - ) -> anyhow::Result> { - let tenant_id_string = self - .http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + ) -> anyhow::Result { + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) .json(&TenantCreateRequest { new_tenant_id, checkpoint_distance: settings @@ -417,18 +410,16 @@ impl PageServerNode { }) .send()? .error_from_body()? - .json::>()?; - - tenant_id_string - .map(|id| { - id.parse().with_context(|| { - format!( - "Failed to parse tennat creation response as tenant id: {}", - id - ) + .json::>() + .with_context(|| { + format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") + })? + .context("No tenant id was found in the tenant creation response") + .and_then(|tenant_id_string| { + tenant_id_string.parse().with_context(|| { + format!("Failed to parse response string as tenant id: '{tenant_id_string}'") }) }) - .transpose() } pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> { @@ -499,22 +490,27 @@ impl PageServerNode { new_timeline_id: Option, ancestor_start_lsn: Option, ancestor_timeline_id: Option, - ) -> anyhow::Result> { - let timeline_info_response = self - .http_request( - Method::POST, - format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) -> anyhow::Result { + self.http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .json(&TimelineCreateRequest { + new_timeline_id, + ancestor_start_lsn, + ancestor_timeline_id, + }) + .send()? + .error_from_body()? + .json::>() + .with_context(|| { + format!("Failed to parse timeline creation response for tenant id: {tenant_id}") + })? + .with_context(|| { + format!( + "No timeline id was found in the timeline creation response for tenant {tenant_id}" ) - .json(&TimelineCreateRequest { - new_timeline_id, - ancestor_start_lsn, - ancestor_timeline_id, - }) - .send()? - .error_from_body()? - .json::>()?; - - Ok(timeline_info_response) + }) } /// Import a basebackup prepared using either: diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh deleted file mode 100755 index 75dbdaed7a..0000000000 --- a/docker-entrypoint.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -set -eux - -pageserver_id_param="${NODE_ID:-10}" - -broker_endpoints_param="${BROKER_ENDPOINT:-absent}" -if [ "$broker_endpoints_param" != "absent" ]; then - broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']" -else - broker_endpoints_param='' -fi - -remote_storage_param="${REMOTE_STORAGE:-}" - -if [ "$1" = 'pageserver' ]; then - if [ ! -d "/data/tenants" ]; then - echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param - fi - echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data -else - "$@" -fi diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index c4dd52e183..78a465539a 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -501,10 +501,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { // default_tenantid was generated by the `env.init()` call above let initial_tenant_id = env.default_tenant_id.unwrap(); - // Call 'pageserver init'. + // Initialize pageserver, create initial tenant and timeline. let pageserver = PageServerNode::from_env(&env); let initial_timeline_id = pageserver - .init( + .initialize( Some(initial_tenant_id), initial_timeline_id_arg, &pageserver_config_overrides(init_match), @@ -551,25 +551,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an .values_of("config") .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) .unwrap_or_default(); - let new_tenant_id = pageserver - .tenant_create(initial_tenant_id, tenant_conf)? - .ok_or_else(|| { - anyhow!("Tenant with id {:?} was already created", initial_tenant_id) - })?; - println!( - "tenant {} successfully created on the pageserver", - new_tenant_id - ); + let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; + println!("tenant {new_tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant let new_timeline_id = parse_timeline_id(create_match)?; - let timeline = pageserver - .timeline_create(new_tenant_id, new_timeline_id, None, None)? - .context(format!( - "Failed to create initial timeline for tenant {new_tenant_id}" - ))?; - let new_timeline_id = timeline.timeline_id; - let last_record_lsn = timeline + let timeline_info = + pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?; + let new_timeline_id = timeline_info.timeline_id; + let last_record_lsn = timeline_info .local .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))? .last_record_lsn; @@ -616,20 +606,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let new_branch_name = create_match .value_of("branch-name") .ok_or_else(|| anyhow!("No branch name provided"))?; - let timeline = pageserver - .timeline_create(tenant_id, None, None, None)? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}", - timeline.timeline_id, last_record_lsn, tenant_id, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}", + timeline_info.timeline_id ); } Some(("import", import_match)) => { @@ -680,10 +668,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - let ancestor_timeline_id = env .get_branch_timeline_id(ancestor_branch_name, tenant_id) .ok_or_else(|| { - anyhow!( - "Found no timeline id for branch name '{}'", - ancestor_branch_name - ) + anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") })?; let start_lsn = branch_match @@ -691,12 +676,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - .map(Lsn::from_str) .transpose() .context("Failed to parse ancestor start Lsn from the request")?; - let timeline = pageserver - .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? - .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; - let new_timeline_id = timeline.timeline_id; + let timeline_info = pageserver.timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + )?; + let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline + let last_record_lsn = timeline_info .local .expect("no local timeline info") .last_record_lsn; @@ -704,11 +692,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; println!( - "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'", - timeline.timeline_id, last_record_lsn, tenant_id, ancestor_branch_name, + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. Ancestor timeline: '{ancestor_branch_name}'", + timeline_info.timeline_id ); } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), None => bail!("no tenant subcommand provided"), } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b539964414..1a13147f42 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,6 @@ //! Main entry point for the Page Server executable. -use std::{env, path::Path, str::FromStr}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use tracing::*; use anyhow::{bail, Context, Result}; @@ -13,7 +13,7 @@ use pageserver::{ config::{defaults::*, PageServerConf}, http, page_cache, page_service, profiling, tenant_mgr, thread_mgr, thread_mgr::ThreadKind, - timelines, virtual_file, LOG_FILE_NAME, + virtual_file, LOG_FILE_NAME, }; use utils::{ auth::JwtAuth, @@ -24,7 +24,6 @@ use utils::{ shutdown::exit_now, signals::{self, Signal}, tcp_listener, - zid::{ZTenantId, ZTimelineId}, }; project_git_version!(GIT_VERSION); @@ -42,6 +41,7 @@ fn main() -> anyhow::Result<()> { .about("Materializes WAL stream to pages and serves them to the postgres") .version(&*version()) .arg( + Arg::new("daemonize") .short('d') .long("daemonize") @@ -52,7 +52,7 @@ fn main() -> anyhow::Result<()> { Arg::new("init") .long("init") .takes_value(false) - .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"), + .help("Initialize pageserver with all given config overrides"), ) .arg( Arg::new("workdir") @@ -61,20 +61,6 @@ fn main() -> anyhow::Result<()> { .takes_value(true) .help("Working directory for the pageserver"), ) - .arg( - Arg::new("create-tenant") - .long("create-tenant") - .takes_value(true) - .help("Create tenant during init") - .requires("init"), - ) - .arg( - Arg::new("initial-timeline-id") - .long("initial-timeline-id") - .takes_value(true) - .help("Use a specific timeline id during init and tenant creation") - .requires("create-tenant"), - ) // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") @@ -85,6 +71,9 @@ fn main() -> anyhow::Result<()> { .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) + .arg(Arg::new("update-config").long("update-config").takes_value(false).help( + "Update the config file when started", + )) .arg( Arg::new("enabled-features") .long("enabled-features") @@ -110,18 +99,6 @@ fn main() -> anyhow::Result<()> { .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; let cfg_file_path = workdir.join("pageserver.toml"); - let init = arg_matches.is_present("init"); - let create_tenant = arg_matches - .value_of("create-tenant") - .map(ZTenantId::from_str) - .transpose() - .context("Failed to parse tenant id from the arguments")?; - let initial_timeline_id = arg_matches - .value_of("initial-timeline-id") - .map(ZTimelineId::from_str) - .transpose() - .context("Failed to parse timeline id from the arguments")?; - // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { format!( @@ -131,30 +108,86 @@ fn main() -> anyhow::Result<()> { })?; let daemonize = arg_matches.is_present("daemonize"); - if init && daemonize { - bail!("--daemonize cannot be used with --init") - } - let mut toml = if init { - // We're initializing the repo, so there's no config file yet - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")? - } else { - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) - .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?; - cfg_file_contents - .parse::() - .with_context(|| { - format!( - "Failed to read '{}' as pageserver config", - cfg_file_path.display() - ) - })? + let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? { + ControlFlow::Continue(conf) => conf, + ControlFlow::Break(()) => { + info!("Pageserver config init successful"); + return Ok(()); + } + }; + + let tenants_path = conf.tenants_path(); + if !tenants_path.exists() { + utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| { + format!( + "Failed to create tenants root dir at '{}'", + tenants_path.display() + ) + })?; + } + + // Initialize up failpoints support + let scenario = FailScenario::setup(); + + // Basic initialization of things that don't change after startup + virtual_file::init(conf.max_file_descriptors); + page_cache::init(conf.page_cache_size); + + start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + + scenario.teardown(); + Ok(()) +} + +fn initialize_config( + cfg_file_path: &Path, + arg_matches: clap::ArgMatches, + workdir: &Path, +) -> anyhow::Result> { + let init = arg_matches.is_present("init"); + let update_config = init || arg_matches.is_present("update-config"); + + let (mut toml, config_file_exists) = if cfg_file_path.is_file() { + if init { + anyhow::bail!( + "Config file '{}' already exists, cannot init it, use --update-config to update it", + cfg_file_path.display() + ); + } + // Supplement the CLI arguments with the config file + let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + format!( + "Failed to read pageserver config at '{}'", + cfg_file_path.display() + ) + })?; + ( + cfg_file_contents + .parse::() + .with_context(|| { + format!( + "Failed to parse '{}' as pageserver config", + cfg_file_path.display() + ) + })?, + true, + ) + } else if cfg_file_path.exists() { + anyhow::bail!( + "Config file '{}' exists but is not a regular file", + cfg_file_path.display() + ); + } else { + // We're initializing the repo, so there's no config file yet + ( + DEFAULT_CONFIG_FILE + .parse::() + .context("could not parse built-in config file")?, + false, + ) }; - // Process any extra options given with -c if let Some(values) = arg_matches.values_of("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -165,49 +198,38 @@ fn main() -> anyhow::Result<()> { })?; for (key, item) in doc.iter() { - if key == "id" { - anyhow::ensure!( - init, - "node id can only be set during pageserver init and cannot be overridden" - ); + if config_file_exists && update_config && key == "id" && toml.contains_key(key) { + anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display()); } toml.insert(key, item.clone()); } } } - trace!("Resulting toml: {}", toml); - let conf = PageServerConf::parse_and_validate(&toml, &workdir) + + debug!("Resulting toml: {toml}"); + let conf = PageServerConf::parse_and_validate(&toml, workdir) .context("Failed to parse pageserver configuration")?; - // The configuration is all set up now. Turn it into a 'static - // that can be freely stored in structs and passed across threads - // as a ref. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + if update_config { + info!("Writing pageserver config to '{}'", cfg_file_path.display()); - // Initialize up failpoints support - let scenario = FailScenario::setup(); - - // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); - page_cache::init(conf.page_cache_size); - - // Create repo and exit if init was requested - if init { - timelines::init_pageserver(conf, create_tenant, initial_timeline_id) - .context("Failed to init pageserver")?; - // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( - "Failed to initialize pageserver config at '{}'", + "Failed to write pageserver config to '{}'", cfg_file_path.display() ) })?; - } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver")?; + info!( + "Config successfully written to '{}'", + cfg_file_path.display() + ) } - scenario.teardown(); - Ok(()) + Ok(if init { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(Box::leak(Box::new(conf))) + }) } fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d90cd7371a..64f1caa542 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,7 +9,6 @@ use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; use crate::thread_mgr::ThreadKind; -use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; use crate::{thread_mgr, timelines, walreceiver}; use anyhow::Context; @@ -284,10 +283,8 @@ pub fn create_tenant_repository( conf, tenant_conf, tenant_id, - CreateRepo::Real { - wal_redo_manager, - remote_index, - }, + wal_redo_manager, + remote_index, )?; v.insert(Tenant { state: TenantState::Idle, diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index c5b938c5fe..ed5975d3bd 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -13,17 +13,17 @@ use std::{ use tracing::*; use utils::{ - crashsafe_dir, logging, + crashsafe_dir, lsn::Lsn, zid::{ZTenantId, ZTimelineId}, }; +use crate::import_datadir; use crate::tenant_mgr; use crate::{ config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt, }; -use crate::{import_datadir, LOG_FILE_NAME}; use crate::{ layered_repository::{LayeredRepository, LayeredTimeline}, walredo::WalRedoManager, @@ -36,69 +36,13 @@ pub struct PointInTime { pub lsn: Lsn, } -pub fn init_pageserver( - conf: &'static PageServerConf, - create_tenant: Option, - initial_timeline_id: Option, -) -> anyhow::Result<()> { - // Initialize logger - // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let _log_file = logging::init(LOG_FILE_NAME, true)?; - - crashsafe_dir::create_dir_all(conf.tenants_path())?; - - if let Some(tenant_id) = create_tenant { - println!("initializing tenantid {}", tenant_id); - let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy) - .context("failed to create repo")?; - let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); - bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref()) - .context("failed to create initial timeline")?; - println!("initial timeline {} created", new_timeline_id) - } else if initial_timeline_id.is_some() { - println!("Ignoring initial timeline parameter, due to no tenant id to create given"); - } - - println!("pageserver init succeeded"); - Ok(()) -} - -pub enum CreateRepo { - Real { - wal_redo_manager: Arc, - remote_index: RemoteIndex, - }, - Dummy, -} - pub fn create_repo( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: ZTenantId, - create_repo: CreateRepo, + wal_redo_manager: Arc, + remote_index: RemoteIndex, ) -> Result> { - let (wal_redo_manager, remote_index) = match create_repo { - CreateRepo::Real { - wal_redo_manager, - remote_index, - } => (wal_redo_manager, remote_index), - CreateRepo::Dummy => { - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. - let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - - (wal_redo_manager as _, RemoteIndex::default()) - } - }; - let repo_dir = conf.tenant_path(&tenant_id); ensure!( !repo_dir.exists(), diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 85f970a941..57817dbc9c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -82,24 +82,6 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } -/// -/// A dummy WAL Redo Manager implementation that doesn't allow replaying -/// anything. Currently used during bootstrapping (zenith init), to create -/// a Repository object without launching the real WAL redo process. -/// -pub struct DummyRedoManager {} -impl crate::walredo::WalRedoManager for DummyRedoManager { - fn request_redo( - &self, - _key: Key, - _lsn: Lsn, - _base_img: Option, - _records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - Err(WalRedoError::InvalidState) - } -} - // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 51df41699a..710b220ae8 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,7 +1,11 @@ from typing import Optional from uuid import uuid4, UUID import pytest +import pathlib +import os +import subprocess from fixtures.utils import lsn_from_hex +from fixtures.log_helper import log from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -9,16 +13,43 @@ from fixtures.neon_fixtures import ( NeonPageserverHttpClient, NeonPageserverApiException, wait_until, + neon_binpath, + pg_distrib_dir, ) -# test that we cannot override node id -def test_pageserver_init_node_id(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init() - with pytest.raises( - Exception, - match="node id can only be set during pageserver init and cannot be overridden"): - env.pageserver.start(overrides=['--pageserver-config-override=id=10']) +# test that we cannot override node id after init +def test_pageserver_init_node_id(neon_simple_env: NeonEnv): + repo_dir = neon_simple_env.repo_dir + pageserver_config = repo_dir / 'pageserver.toml' + pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' + run_pageserver = lambda args: subprocess.run([str(pageserver_bin), '-D', str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # remove initial config + pageserver_config.unlink() + + bad_init = run_pageserver(['--init', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_init.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "missing id" in bad_init.stderr + assert not pageserver_config.exists(), 'config file should not be created after init error' + + completed_init = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert completed_init.returncode == 0, 'pageserver should be able to create a new config with the node id given' + assert pageserver_config.exists(), 'config file should be created successfully' + + bad_reinit = run_pageserver( + ['--init', '-c', 'id = 12345', '-c', f'pg_distrib_dir="{pg_distrib_dir}"']) + assert bad_reinit.returncode == 1, 'pageserver should not be able to init new config without the node id' + assert "already exists, cannot init it" in bad_reinit.stderr + + bad_update = run_pageserver(['--update-config', '-c', 'id = 3']) + assert bad_update.returncode == 1, 'pageserver should not allow updating node id' + assert "has node id already, it cannot be overridden" in bad_update.stderr def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 176ca740fe..eb65e2e3b5 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -44,30 +44,22 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers """ - cmd = [ - str(pageserver_bin), - '--init', - '--workdir', - str(new_pageserver_dir), - f"-c listen_pg_addr='localhost:{pg_port}'", - f"-c listen_http_addr='localhost:{http_port}'", - f"-c pg_distrib_dir='{pg_distrib_dir}'", - f"-c id=2", - f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", - ] - - if broker is not None: - cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) - - subprocess.check_output(cmd, text=True) - # actually run new pageserver cmd = [ str(pageserver_bin), '--workdir', str(new_pageserver_dir), '--daemonize', + '--update-config', + f"-c listen_pg_addr='localhost:{pg_port}'", + f"-c listen_http_addr='localhost:{http_port}'", + f"-c pg_distrib_dir='{pg_distrib_dir}'", + f"-c id=2", + f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", ] + if broker is not None: + cmd.append(f"-c broker_endpoints=['{broker.client_url()}']", ) + log.info("starting new pageserver %s", cmd) out = subprocess.check_output(cmd, text=True) log.info("started new pageserver %s", out) From 92bdf04758f6c024737927a4db76769458c8f091 Mon Sep 17 00:00:00 2001 From: Rory de Zoete <33318916+zoete@users.noreply.github.com> Date: Thu, 18 Aug 2022 09:41:24 +0200 Subject: [PATCH 17/21] Fix: Always build images (#2296) * Always build images * Remove unused Co-authored-by: Rory de Zoete --- .github/workflows/build_and_test.yml | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9b46d36c92..6b76b6e5fc 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -415,30 +415,8 @@ jobs: } }" - dockerfile-check: - if: github.event_name != 'workflow_dispatch' - runs-on: dev - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest - outputs: - value: ${{ steps.dockerfile-check.outputs.any_changed }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Get specific changed files - id: dockerfile-check - uses: tj-actions/changed-files@802732316a11c01531ea72773ec7998155238e31 # v25 - with: - files: | - Dockerfile - Dockerfile.compute-tools - ./vendor/postgres/Dockerfile - neon-image: - # force building for all 3 images - if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev - needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: @@ -455,9 +433,7 @@ jobs: run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID compute-tools-image: - if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev - needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: @@ -471,9 +447,7 @@ jobs: run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID compute-node-image: - if: needs.dockerfile-check.outputs.value == 'true' runs-on: dev - needs: [ dockerfile-check ] container: gcr.io/kaniko-project/executor:v1.9.0-debug steps: From 9bc12f7444e8a574d4ac5cfe2b213a91390342c8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Jul 2022 13:45:28 +0300 Subject: [PATCH 18/21] Move auto-generated 'bindings' to a separate inner module. Re-export only things that are used by other modules. In the future, I'm imagining that we run bindgen twice, for Postgres v14 and v15. The two sets of bindings would go into separate 'bindings_v14' and 'bindings_v15' modules. Rearrange postgres_ffi modules. Move function, to avoid Postgres version dependency in timelines.rs Move function to generate a logical-message WAL record to postgres_ffi. --- libs/postgres_ffi/src/controlfile_utils.rs | 2 +- libs/postgres_ffi/src/lib.rs | 59 +++++++-- libs/postgres_ffi/src/nonrelfile_utils.rs | 5 +- libs/postgres_ffi/src/pg_constants.rs | 9 +- libs/postgres_ffi/src/relfile_utils.rs | 2 +- libs/postgres_ffi/src/waldecoder.rs | 5 +- libs/postgres_ffi/src/xlog_utils.rs | 113 ++++++++++++++++-- libs/postgres_ffi/wal_craft/src/lib.rs | 4 +- pageserver/src/basebackup.rs | 23 ++-- pageserver/src/import_datadir.rs | 32 +++-- pageserver/src/keyspace.rs | 4 +- pageserver/src/layered_repository/timeline.rs | 2 +- pageserver/src/page_cache.rs | 2 +- pageserver/src/page_service.rs | 10 +- pageserver/src/pgdatadir_mapping.rs | 18 +-- pageserver/src/reltag.rs | 5 +- pageserver/src/timelines.rs | 14 +-- pageserver/src/walingest.rs | 16 +-- .../src/walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walrecord.rs | 17 ++- pageserver/src/walredo.rs | 21 ++-- safekeeper/src/handler.rs | 2 +- safekeeper/src/json_ctrl.rs | 97 +-------------- safekeeper/src/metrics.rs | 2 +- safekeeper/src/safekeeper.rs | 5 +- safekeeper/src/send_wal.rs | 2 +- safekeeper/src/timeline.rs | 3 +- safekeeper/src/wal_backup.rs | 3 +- safekeeper/src/wal_storage.rs | 10 +- 29 files changed, 265 insertions(+), 224 deletions(-) diff --git a/libs/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs index 4df2342b90..0918d15001 100644 --- a/libs/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -23,7 +23,7 @@ //! information. You can use PostgreSQL's pg_controldata utility to view its //! contents. //! -use crate::{ControlFileData, PG_CONTROL_FILE_SIZE}; +use super::bindings::{ControlFileData, PG_CONTROL_FILE_SIZE}; use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 28d9a13dbf..022355329c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -7,21 +7,62 @@ // https://github.com/rust-lang/rust-bindgen/issues/1651 #![allow(deref_nullptr)] -use serde::{Deserialize, Serialize}; use utils::lsn::Lsn; -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +macro_rules! postgres_ffi { + ($version:ident) => { + #[path = "."] + pub mod $version { + // fixme: does this have to be 'pub'? + pub mod bindings { + // bindgen generates bindings for a lot of stuff we don't need + #![allow(dead_code)] -pub mod controlfile_utils; -pub mod nonrelfile_utils; -pub mod pg_constants; -pub mod relfile_utils; -pub mod waldecoder; -pub mod xlog_utils; + use serde::{Deserialize, Serialize}; + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + } + pub mod controlfile_utils; + pub mod nonrelfile_utils; + pub mod pg_constants; + pub mod relfile_utils; + pub mod waldecoder; + pub mod xlog_utils; + + // Re-export some symbols from bindings + pub use bindings::DBState_DB_SHUTDOWNED; + pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + } + }; +} + +postgres_ffi!(v14); + +// Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::{uint32, uint64, Oid}; +pub use v14::bindings::{BlockNumber, OffsetNumber}; +pub use v14::bindings::{MultiXactId, TransactionId}; + +// Likewise for these, although the assumption that these don't change is a little more iffy. +pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; + +// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and +// --with-segsize=SEGSIZE, but assume the defaults for now. +pub const BLCKSZ: u16 = 8192; +pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); +pub const XLOG_BLCKSZ: usize = 8192; + +// PG timeline is always 1, changing it doesn't have any useful meaning in Neon. +// +// NOTE: this is not to be confused with Neon timelines; different concept! +// +// It's a shaky assumption, that it's always 1. We might import a +// PostgreSQL data directory that has gone through timeline bumps, +// for example. FIXME later. +pub const PG_TLI: u32 = 1; // See TransactionIdIsNormal in transam.h pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > pg_constants::FIRST_NORMAL_TRANSACTION_ID + id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID } // See TransactionIdPrecedes in transam.c diff --git a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index b92207cd81..04ef346d88 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,11 +1,12 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! -use crate::{pg_constants, transaction_id_precedes}; +use crate::transaction_id_precedes; +use super::pg_constants; use bytes::BytesMut; use log::*; -use crate::MultiXactId; +use super::bindings::MultiXactId; pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { trace!( diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 7230b841f5..42b5c5d842 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -7,7 +7,8 @@ //! comments on them. //! -use crate::PageHeaderData; +use super::bindings::PageHeaderData; +use crate::BLCKSZ; // // From pg_tablespace_d.h @@ -31,11 +32,6 @@ pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; -// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and -// --with-segsize=SEGSIZE, but assume the defaults for now. -pub const BLCKSZ: u16 = 8192; -pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); - // // From bufpage.h // @@ -213,7 +209,6 @@ pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; /* FIXME: pageserver should request wal_seg_size from compute node */ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; -pub const XLOG_BLCKSZ: usize = 8192; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; diff --git a/libs/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs index cc9d6470c0..f3476acc9c 100644 --- a/libs/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,7 +1,7 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! -use crate::pg_constants; +use super::pg_constants; use once_cell::sync::OnceCell; use regex::Regex; diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index cbb761236c..0e1c9567cb 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -10,10 +10,7 @@ //! use super::pg_constants; use super::xlog_utils::*; -use super::XLogLongPageHeaderData; -use super::XLogPageHeaderData; -use super::XLogRecord; -use super::XLOG_PAGE_MAGIC; +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use crc32c::*; use log::*; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 956f53ce85..e7838c3f2c 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -7,22 +7,24 @@ // have been named the same as the corresponding PostgreSQL functions instead. // -use crate::pg_constants; -use crate::CheckPoint; -use crate::FullTransactionId; -use crate::XLogLongPageHeaderData; -use crate::XLogPageHeaderData; -use crate::XLogRecord; -use crate::XLOG_PAGE_MAGIC; +use crc32c::crc32c_append; -use crate::pg_constants::WAL_SEGMENT_SIZE; -use crate::waldecoder::WalStreamDecoder; +use super::bindings::{ + CheckPoint, FullTransactionId, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, + XLOG_PAGE_MAGIC, +}; +use super::pg_constants; +use super::pg_constants::WAL_SEGMENT_SIZE; +use crate::v14::waldecoder::WalStreamDecoder; +use crate::PG_TLI; +use crate::{uint32, uint64, Oid}; use bytes::BytesMut; use bytes::{Buf, Bytes}; use log::*; +use serde::Serialize; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; @@ -47,9 +49,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); #[allow(clippy::identity_op)] pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; -// PG timeline is always 1, changing it doesn't have useful meaning in Zenith. -pub const PG_TLI: u32 = 1; - pub type XLogRecPtr = u64; pub type TimeLineID = u32; pub type TimestampTz = i64; @@ -346,6 +345,85 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result Bytes { + use utils::bin_ser::LeSer; + self.ser().unwrap().into() + } +} + +/// Create new WAL record for non-transactional logical message. +/// Used for creating artificial WAL for tests, as LogicalMessage +/// record is basically no-op. +/// +/// NOTE: This leaves the xl_prev field zero. The safekeeper and +/// pageserver tolerate that, but PostgreSQL does not. +pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { + let mut prefix_bytes: Vec = Vec::with_capacity(prefix.len() + 1); + prefix_bytes.write_all(prefix.as_bytes()).unwrap(); + prefix_bytes.push(0); + + let message_bytes = message.as_bytes(); + + let logical_message = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as u64, + message_size: message_bytes.len() as u64, + }; + + let mainrdata = logical_message.encode(); + let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); + // only short mainrdata is supported for now + assert!(mainrdata_len <= 255); + let mainrdata_len = mainrdata_len as u8; + + let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; + data.extend_from_slice(&mainrdata); + data.extend_from_slice(&prefix_bytes); + data.extend_from_slice(message_bytes); + + let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len(); + + let mut header = XLogRecord { + xl_tot_len: total_len as u32, + xl_xid: 0, + xl_prev: 0, + xl_info: 0, + xl_rmid: 21, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: 0, // crc will be calculated later + }; + + let header_bytes = header.encode().expect("failed to encode header"); + let crc = crc32c_append(0, &data); + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut wal: Vec = Vec::new(); + wal.extend_from_slice(&header.encode().expect("failed to encode header")); + wal.extend_from_slice(&data); + + // WAL start position must be aligned at 8 bytes, + // this will add padding for the next WAL record. + const PADDING: usize = 8; + let padding_rem = wal.len() % PADDING; + if padding_rem != 0 { + wal.resize(wal.len() + PADDING - padding_rem, 0); + } + + wal +} + #[cfg(test)] mod tests { use super::*; @@ -547,4 +625,15 @@ mod tests { checkpoint.update_next_xid(1024); assert_eq!(checkpoint.nextXid.value, 2048); } + + #[test] + pub fn test_encode_logical_message() { + let expected = [ + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, + 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, + 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + ]; + let actual = encode_logical_message("prefix", "message"); + assert_eq!(expected, actual[..]); + } } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index e3b666da41..6ac5afb27f 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,8 +4,8 @@ use log::*; use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; -use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE; -use postgres_ffi::xlog_utils::{ +use postgres_ffi::v14::pg_constants::WAL_SEGMENT_SIZE; +use postgres_ffi::v14::xlog_utils::{ XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; use std::cmp::Ordering; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 5837447ce8..33f072553f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -24,8 +24,13 @@ use tracing::*; use crate::reltag::{RelTag, SlruKind}; use crate::DatadirTimeline; -use postgres_ffi::xlog_utils::*; -use postgres_ffi::*; + +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName}; +use postgres_ffi::v14::{CheckPoint, ControlFileData}; +use postgres_ffi::TransactionId; +use postgres_ffi::PG_TLI; +use postgres_ffi::{BLCKSZ, RELSEG_SIZE}; use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, @@ -200,7 +205,7 @@ where } // Add a file for each chunk of blocks (aka segment) - let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize); + let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize); for (seg, blocks) in chunks.into_iter().enumerate() { let mut segment_data: Vec = vec![]; for blknum in blocks { @@ -220,23 +225,19 @@ where fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; - let mut slru_buf: Vec = - Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); + let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; if slru == SlruKind::Clog { - ensure!( - img.len() == pg_constants::BLCKSZ as usize - || img.len() == pg_constants::BLCKSZ as usize + 8 - ); + ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); } else { - ensure!(img.len() == pg_constants::BLCKSZ as usize); + ensure!(img.len() == BLCKSZ as usize); } - slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]); + slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); } let segname = format!("{}/{:>04X}", slru.to_str(), segno); diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 7d1e8e43aa..729829c5e8 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -15,13 +15,24 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; -use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::*; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::v14::relfile_utils::*; +use postgres_ffi::v14::waldecoder::*; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; use postgres_ffi::Oid; -use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; +// Returns checkpoint LSN from controlfile +pub fn get_lsn_from_controlfile(path: &Path) -> Result { + // Read control file to extract the LSN + let controlfile_path = path.join("global").join("pg_control"); + let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?; + let lsn = controlfile.checkPoint; + + Ok(Lsn(lsn)) +} + /// /// Import all relation data pages from local disk into the repository. /// @@ -110,8 +121,8 @@ fn import_rel( let mut buf: [u8; 8192] = [0u8; 8192]; - ensure!(len % pg_constants::BLCKSZ as usize == 0); - let nblocks = len / pg_constants::BLCKSZ as usize; + ensure!(len % BLCKSZ as usize == 0); + let nblocks = len / BLCKSZ as usize; let rel = RelTag { spcnode: spcoid, @@ -120,7 +131,7 @@ fn import_rel( forknum, }; - let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / BLCKSZ as u32); // Call put_rel_creation for every segment of the relation, // because there is no guarantee about the order in which we are processing segments. @@ -144,8 +155,7 @@ fn import_rel( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - let relative_blknum = - blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + let relative_blknum = blknum - segno * (1024 * 1024 * 1024 / BLCKSZ as u32); ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } @@ -184,8 +194,8 @@ fn import_slru( .to_string_lossy(); let segno = u32::from_str_radix(filename, 16)?; - ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ - let nblocks = len / pg_constants::BLCKSZ as usize; + ensure!(len % BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / BLCKSZ as usize; ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index da213704f3..64024a2d8d 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -1,5 +1,5 @@ use crate::repository::{key_range_size, singleton_range, Key}; -use postgres_ffi::pg_constants; +use postgres_ffi::BLCKSZ; use std::ops::Range; /// @@ -19,7 +19,7 @@ impl KeySpace { /// pub fn partition(&self, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as usize; let mut parts = Vec::new(); let mut current_part = Vec::new(); diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs index e27619cc83..6ef4915bdb 100644 --- a/pageserver/src/layered_repository/timeline.rs +++ b/pageserver/src/layered_repository/timeline.rs @@ -45,7 +45,7 @@ use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; use crate::DatadirTimeline; -use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::v14::xlog_utils::to_pg_timestamp; use utils::{ lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 716df0f749..818eaf1b8f 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -83,7 +83,7 @@ pub fn get() -> &'static PageCache { } } -pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize; +pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize; const MAX_USAGE_COUNT: u8 = 5; /// diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3c5ea5267e..b63bb90be1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -40,9 +40,10 @@ use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; -use postgres_ffi::xlog_utils::to_pg_timestamp; +use postgres_ffi::v14::xlog_utils::to_pg_timestamp; -use postgres_ffi::pg_constants; +use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::BLCKSZ; // Wrapped in libpq CopyData enum PagestreamFeMessage { @@ -725,10 +726,9 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let total_blocks = - timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; + let total_blocks = timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64; + let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 113f40302a..88fac0ad5a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,8 +13,10 @@ use crate::repository::*; use crate::walrecord::ZenithWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; -use postgres_ffi::xlog_utils::TimestampTz; -use postgres_ffi::{pg_constants, Oid, TransactionId}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::TimestampTz; +use postgres_ffi::BLCKSZ; +use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; @@ -297,9 +299,9 @@ pub trait DatadirTimeline: Timeline { let clog_page = self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; - if clog_page.len() == pg_constants::BLCKSZ as usize + 8 { + if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; - timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]); + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); if timestamp >= search_timestamp { @@ -382,7 +384,7 @@ pub trait DatadirTimeline: Timeline { total_size += relsize as usize; } } - Ok(total_size * pg_constants::BLCKSZ as usize) + Ok(total_size * BLCKSZ as usize) } /// @@ -912,7 +914,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { result?; if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); self.pending_nblocks = 0; } @@ -940,7 +942,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> { writer.finish_write(lsn); if pending_nblocks != 0 { - writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize); + writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize); } Ok(()) @@ -1014,7 +1016,7 @@ struct SlruSegmentDirectory { segments: HashSet, } -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usize]); +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); // Layout of the Key address space // diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index fadd41f547..e3d08f8b3d 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -2,8 +2,9 @@ use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; -use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::{pg_constants, Oid}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::relfile_utils::forknumber_to_name; +use postgres_ffi::Oid; /// /// Relation data file segment id throughout the Postgres cluster. diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index ed5975d3bd..0d35195691 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -3,7 +3,7 @@ // use anyhow::{bail, ensure, Context, Result}; -use postgres_ffi::ControlFileData; + use std::{ fs, path::Path, @@ -69,16 +69,6 @@ pub fn create_repo( ))) } -// Returns checkpoint LSN from controlfile -fn get_lsn_from_controlfile(path: &Path) -> Result { - // Read control file to extract the LSN - let controlfile_path = path.join("global").join("pg_control"); - let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?; - let lsn = controlfile.checkPoint; - - Ok(Lsn(lsn)) -} - // Create the cluster temporarily in 'initdbpath' directory inside the repository // to get bootstrap data for timeline initialization. // @@ -128,7 +118,7 @@ fn bootstrap_timeline( run_initdb(conf, &initdb_path)?; let pgdata_path = initdb_path; - let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); + let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); // Import the contents of the data directory at the initial checkpoint // LSN, and any WAL after that. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index b8064849e0..1b046b9f33 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -22,8 +22,8 @@ //! bespoken Rust code. use anyhow::Context; -use postgres_ffi::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; +use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; use postgres_ffi::{page_is_new, page_set_lsn}; use anyhow::Result; @@ -33,10 +33,12 @@ use tracing::*; use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::walrecord::*; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; -use postgres_ffi::{pg_constants, CheckPoint}; +use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); @@ -293,7 +295,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { // Extract page image from FPI record let img_len = blk.bimg_len as usize; let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize); + let mut image = BytesMut::with_capacity(BLCKSZ as usize); image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); if blk.hole_length != 0 { @@ -309,7 +311,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> { if !page_is_new(&image) { page_set_lsn(&mut image, lsn) } - assert_eq!(image.len(), pg_constants::BLCKSZ as usize); + assert_eq!(image.len(), BLCKSZ as usize); self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { let rec = ZenithWalRecord::Postgres { diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 16a1f232e3..0688086117 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -27,7 +27,7 @@ use crate::{ walingest::WalIngest, walrecord::DecodedWALRecord, }; -use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::v14::waldecoder::WalStreamDecoder; use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; /// Status of the connection. diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 6b01d52005..c56b1c6c0c 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -3,9 +3,10 @@ //! use anyhow::Result; use bytes::{Buf, Bytes}; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; -use postgres_ffi::XLogRecord; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::v14::XLogRecord; +use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, OffsetNumber}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -618,7 +619,7 @@ pub fn decode_wal_record( blk.hole_length = 0; } } else { - blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len; + blk.hole_length = BLCKSZ - blk.bimg_len; } datatotal += blk.bimg_len as u32; blocks_total_len += blk.bimg_len as u32; @@ -628,9 +629,7 @@ pub fn decode_wal_record( * bimg_len < BLCKSZ if the HAS_HOLE flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 - || blk.hole_length == 0 - || blk.bimg_len == pg_constants::BLCKSZ) + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) { // TODO /* @@ -667,7 +666,7 @@ pub fn decode_wal_record( * flag is set. */ if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == pg_constants::BLCKSZ + && blk.bimg_len == BLCKSZ { // TODO /* @@ -685,7 +684,7 @@ pub fn decode_wal_record( */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 - && blk.bimg_len != pg_constants::BLCKSZ + && blk.bimg_len != BLCKSZ { // TODO /* diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 57817dbc9c..9cf347573a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -44,11 +44,12 @@ use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::ZenithWalRecord; use metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset; -use postgres_ffi::nonrelfile_utils::transaction_id_set_status; -use postgres_ffi::pg_constants; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::BLCKSZ; /// /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. @@ -417,10 +418,10 @@ impl PostgresRedoManager { } // Append the timestamp - if page.len() == pg_constants::BLCKSZ as usize + 8 { - page.truncate(pg_constants::BLCKSZ as usize); + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); } - if page.len() == pg_constants::BLCKSZ as usize { + if page.len() == BLCKSZ as usize { page.extend_from_slice(×tamp.to_be_bytes()); } else { warn!( @@ -741,7 +742,7 @@ impl PostgresRedoProcess { // We expect the WAL redo process to respond with an 8k page image. We read it // into this buffer. - let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()]; + let mut resultbuf = vec![0; BLCKSZ.into()]; let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far // Prepare for calling poll() @@ -754,7 +755,7 @@ impl PostgresRedoProcess { // We do three things simultaneously: send the old base image and WAL records to // the child process's stdin, read the result from child's stdout, and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < pg_constants::BLCKSZ.into() { + while nresult < BLCKSZ.into() { // If we have more data to write, wake up if 'stdin' becomes writeable or // we have data to read. Otherwise only wake up if there's data to read. let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index a8121e829e..63bc9bd517 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -9,7 +9,7 @@ use crate::timeline::{Timeline, TimelineTools}; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; -use postgres_ffi::xlog_utils::PG_TLI; +use postgres_ffi::PG_TLI; use regex::Regex; use std::str::FromStr; use std::sync::Arc; diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 97fb3654d2..3f84e7b183 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,8 +7,7 @@ //! use anyhow::Result; -use bytes::{BufMut, Bytes, BytesMut}; -use crc32c::crc32c_append; +use bytes::Bytes; use serde::{Deserialize, Serialize}; use tracing::*; @@ -19,9 +18,8 @@ use crate::safekeeper::{ }; use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; use crate::timeline::TimelineTools; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils; -use postgres_ffi::{uint32, uint64, Oid, XLogRecord}; +use postgres_ffi::v14::pg_constants; +use postgres_ffi::v14::xlog_utils; use utils::{ lsn::Lsn, postgres_backend::PostgresBackend, @@ -144,7 +142,7 @@ fn append_logical_message( spg: &mut SafekeeperPostgresHandler, msg: &AppendLogicalMessage, ) -> Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message); let sk_state = spg.timeline.get().get_state().1; let begin_lsn = msg.begin_lsn; @@ -182,90 +180,3 @@ fn append_logical_message( append_response, }) } - -#[repr(C)] -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -struct XlLogicalMessage { - db_id: Oid, - transactional: uint32, // bool, takes 4 bytes due to alignment in C structures - prefix_size: uint64, - message_size: uint64, -} - -impl XlLogicalMessage { - pub fn encode(&self) -> Bytes { - use utils::bin_ser::LeSer; - self.ser().unwrap().into() - } -} - -/// Create new WAL record for non-transactional logical message. -/// Used for creating artificial WAL for tests, as LogicalMessage -/// record is basically no-op. -fn encode_logical_message(prefix: &str, message: &str) -> Vec { - let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1); - prefix_bytes.put(prefix.as_bytes()); - prefix_bytes.put_u8(0); - - let message_bytes = message.as_bytes(); - - let logical_message = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as u64, - message_size: message_bytes.len() as u64, - }; - - let mainrdata = logical_message.encode(); - let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); - // only short mainrdata is supported for now - assert!(mainrdata_len <= 255); - let mainrdata_len = mainrdata_len as u8; - - let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; - data.extend_from_slice(&mainrdata); - data.extend_from_slice(&prefix_bytes); - data.extend_from_slice(message_bytes); - - let total_len = xlog_utils::XLOG_SIZE_OF_XLOG_RECORD + data.len(); - - let mut header = XLogRecord { - xl_tot_len: total_len as u32, - xl_xid: 0, - xl_prev: 0, - xl_info: 0, - xl_rmid: 21, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: 0, // crc will be calculated later - }; - - let header_bytes = header.encode().expect("failed to encode header"); - let crc = crc32c_append(0, &data); - let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode().expect("failed to encode header")); - wal.extend_from_slice(&data); - - // WAL start position must be aligned at 8 bytes, - // this will add padding for the next WAL record. - const PADDING: usize = 8; - let padding_rem = wal.len() % PADDING; - if padding_rem != 0 { - wal.resize(wal.len() + PADDING - padding_rem, 0); - } - - wal -} - -#[test] -fn test_encode_logical_message() { - let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, - 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, - ]; - let actual = encode_logical_message("prefix", "message"); - assert_eq!(expected, actual[..]); -} diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index fe4f9d231c..648f0634f8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -7,7 +7,7 @@ use metrics::{ proto::MetricFamily, Gauge, IntGaugeVec, }; -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::XLogSegNo; use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::{ diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 88747f14e5..22f8ca2de4 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,9 +5,7 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use etcd_broker::subscription_value::SkTimelineInfo; -use postgres_ffi::xlog_utils::TimeLineID; - -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -19,7 +17,6 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, lsn::Lsn, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 4a9c56859f..243d7bf7d0 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -6,7 +6,7 @@ use crate::timeline::{ReplicaState, Timeline, TimelineTools}; use crate::wal_storage::WalReader; use anyhow::{bail, Context, Result}; -use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; +use postgres_ffi::v14::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 161fca3595..3a10c5d59e 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,8 +4,9 @@ use anyhow::{bail, Context, Result}; use etcd_broker::subscription_value::SkTimelineInfo; + use once_cell::sync::Lazy; -use postgres_ffi::xlog_utils::XLogSegNo; +use postgres_ffi::v14::xlog_utils::XLogSegNo; use serde::Serialize; use tokio::sync::watch; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index b2f9d8d4f3..3552452470 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -11,7 +11,8 @@ use std::pin::Pin; use std::sync::Arc; use std::time::Duration; -use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use postgres_ffi::v14::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr}; +use postgres_ffi::PG_TLI; use remote_storage::{GenericRemoteStorage, RemoteStorage}; use tokio::fs::File; use tokio::runtime::Builder; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 5f4bf588c7..6a45ae1411 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -13,9 +13,10 @@ use std::pin::Pin; use tokio::io::AsyncRead; use once_cell::sync::Lazy; -use postgres_ffi::xlog_utils::{ - find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, PG_TLI, +use postgres_ffi::v14::xlog_utils::{ + find_end_of_wal, IsPartialXLogFileName, IsXLogFileName, XLogFromFileName, XLogSegNo, }; +use postgres_ffi::PG_TLI; use std::cmp::min; use std::fs::{self, remove_file, File, OpenOptions}; @@ -30,9 +31,10 @@ use crate::safekeeper::SafeKeeperState; use crate::wal_backup::read_object; use crate::SafeKeeperConf; -use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; +use postgres_ffi::v14::xlog_utils::XLogFileName; +use postgres_ffi::XLOG_BLCKSZ; -use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::v14::waldecoder::WalStreamDecoder; use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; From 1a07ddae5fa5b2096a5a356b8df6bb8d2bc30896 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 18 Aug 2022 00:25:15 +0300 Subject: [PATCH 19/21] fix cargo test --- pageserver/src/walingest.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1b046b9f33..05afe4ba3e 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1035,7 +1035,8 @@ mod tests { use crate::pgdatadir_mapping::create_test_timeline; use crate::repository::repo_harness::*; use crate::repository::Timeline; - use postgres_ffi::pg_constants; + use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; + use postgres_ffi::RELSEG_SIZE; /// Arbitrary relation tag, for testing. const TESTREL_A: RelTag = RelTag { @@ -1324,7 +1325,7 @@ mod tests { let mut walingest = init_walingest_test(&*tline)?; let mut lsn = 0x10; - for blknum in 0..pg_constants::RELSEG_SIZE + 1 { + for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); @@ -1334,31 +1335,22 @@ mod tests { assert_current_logical_size(&*tline, Lsn(lsn)); - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE + 1 - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE + 1); // Truncate one block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?; + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate another block lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?; + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; - assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn))?, - pg_constants::RELSEG_SIZE - 1 - ); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn))?, RELSEG_SIZE - 1); assert_current_logical_size(&*tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time From 976576ae599f1fa742c467f22798bc3c39be049d Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 18 Aug 2022 13:38:23 +0300 Subject: [PATCH 20/21] Fix walreceiver and safekeeper bugs (#2295) - There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver. - IDENTIFY_SYSTEM previously returned `flush_lsn` as a physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else including pageserver. - There was an issue with backoff where connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking retry logic. - There was an issue with getting `NoKeepAlives` reason in a loop. The issue is probably the same as the previous. - There was an issue with filtering safekeepers based on retry attempts, which could filter some safekeepers indefinetely. This is fixed by using retry cooldown duration instead of retry attempts. - Some `send_wal.rs` connections failed with errors without context. This is fixed by adding a timeline to safekeepers errors. New retry logic works like this: - Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment - When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect. - When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly. --- .../src/walreceiver/connection_manager.rs | 200 ++++++++++++------ .../src/walreceiver/walreceiver_connection.rs | 33 ++- safekeeper/src/handler.rs | 49 +++-- safekeeper/src/send_wal.rs | 12 +- 4 files changed, 189 insertions(+), 105 deletions(-) diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 0f11a2197a..e8e0a7c52b 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -96,6 +96,8 @@ async fn connection_manager_loop_step( info!("Subscribed for etcd timeline changes, waiting for new etcd data"); loop { + let time_until_next_retry = walreceiver_state.time_until_next_retry(); + select! { broker_connection_result = &mut broker_subscription.watcher_handle => { cleanup_broker_connection(broker_connection_result, walreceiver_state); @@ -110,27 +112,23 @@ async fn connection_manager_loop_step( } => { let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Started => { - *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1; - }, + TaskEvent::Started => {}, TaskEvent::NewEvent(status) => { - if status.has_received_wal { - // Reset connection attempts here only, we know that safekeeper is healthy - // because it can send us a WAL update. - walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. + walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); } wal_connection.status = status; }, TaskEvent::End(end_result) => { match end_result { Ok(()) => debug!("WAL receiving task finished"), - Err(e) => { - warn!("WAL receiving task failed: {e}"); - // If the task failed, set the connection attempts to at least 1, to try other safekeepers. - let _ = *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(1); - } + Err(e) => warn!("WAL receiving task failed: {e}"), }; - walreceiver_state.wal_connection = None; + walreceiver_state.drop_old_connection(false).await; }, } }, @@ -154,6 +152,8 @@ async fn connection_manager_loop_step( } } }, + + _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} } // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. @@ -234,6 +234,10 @@ async fn subscribe_for_timeline_updates( } } +const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1; +const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0; +const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; + /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. struct WalreceiverState { id: ZTenantTimelineId, @@ -247,7 +251,8 @@ struct WalreceiverState { max_lsn_wal_lag: NonZeroU64, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, - wal_connection_attempts: HashMap, + /// Info about retries and unsuccessful attempts to connect to safekeepers. + wal_connection_retries: HashMap, /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. wal_stream_candidates: HashMap, } @@ -255,6 +260,8 @@ struct WalreceiverState { /// Current connection data. #[derive(Debug)] struct WalConnection { + /// Time when the connection was initiated. + started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, /// Status of the connection. @@ -274,6 +281,12 @@ struct NewCommittedWAL { discovered_at: NaiveDateTime, } +#[derive(Debug)] +struct RetryInfo { + next_retry_at: Option, + retry_duration_seconds: f64, +} + /// Data about the timeline to connect to, received from etcd. #[derive(Debug)] struct EtcdSkTimeline { @@ -300,31 +313,18 @@ impl WalreceiverState { max_lsn_wal_lag, wal_connection: None, wal_stream_candidates: HashMap::new(), - wal_connection_attempts: HashMap::new(), + wal_connection_retries: HashMap::new(), } } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_source_connstr: String) { - if let Some(old_connection) = self.wal_connection.take() { - old_connection.connection_task.shutdown().await - } + self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; - let connection_attempt = self - .wal_connection_attempts - .get(&new_sk_id) - .copied() - .unwrap_or(0); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { - exponential_backoff( - connection_attempt, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; super::walreceiver_connection::handle_walreceiver_connection( id, &new_wal_source_connstr, @@ -340,10 +340,11 @@ impl WalreceiverState { let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { + started_at: now, sk_id: new_sk_id, status: WalConnectionStatus { is_connected: false, - has_received_wal: false, + has_processed_wal: false, latest_connection_update: now, latest_wal_update: now, streaming_lsn: None, @@ -354,6 +355,71 @@ impl WalreceiverState { }); } + /// Drops the current connection (if any) and updates retry timeout for the next + /// connection attempt to the same safekeeper. + async fn drop_old_connection(&mut self, needs_shutdown: bool) { + let wal_connection = match self.wal_connection.take() { + Some(wal_connection) => wal_connection, + None => return, + }; + + if needs_shutdown { + wal_connection.connection_task.shutdown().await; + } + + let retry = self + .wal_connection_retries + .entry(wal_connection.sk_id) + .or_insert(RetryInfo { + next_retry_at: None, + retry_duration_seconds: WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS, + }); + + let now = Utc::now().naive_utc(); + + // Schedule the next retry attempt. We want to have exponential backoff for connection attempts, + // and we add backoff to the time when we started the connection attempt. If the connection + // was active for a long time, then next_retry_at will be in the past. + retry.next_retry_at = + wal_connection + .started_at + .checked_add_signed(chrono::Duration::milliseconds( + (retry.retry_duration_seconds * 1000.0) as i64, + )); + + if let Some(next) = &retry.next_retry_at { + if next > &now { + info!( + "Next connection retry to {:?} is at {}", + wal_connection.sk_id, next + ); + } + } + + let next_retry_duration = + retry.retry_duration_seconds * WALCONNECTION_RETRY_BACKOFF_MULTIPLIER; + // Clamp the next retry duration to the maximum allowed. + let next_retry_duration = next_retry_duration.min(WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS); + // Clamp the next retry duration to the minimum allowed. + let next_retry_duration = next_retry_duration.max(WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS); + + retry.retry_duration_seconds = next_retry_duration; + } + + /// Returns time needed to wait to have a new candidate for WAL streaming. + fn time_until_next_retry(&self) -> Option { + let now = Utc::now().naive_utc(); + + let next_retry_at = self + .wal_connection_retries + .values() + .filter_map(|retry| retry.next_retry_at) + .filter(|next_retry_at| next_retry_at > &now) + .min(); + + next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + } + /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { match self @@ -547,52 +613,37 @@ impl WalreceiverState { /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. /// /// The candidate that is chosen: - /// * has fewest connection attempts from pageserver to safekeeper node (reset every time we receive a WAL message from the node) - /// * has greatest data Lsn among the ones that are left - /// - /// NOTE: - /// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but - /// otherwise to reset the connection attempts, a successful connection to that node is needed. - /// That won't happen now, before all nodes with less connection attempts are connected to first, which might leave the sk node with more advanced state to be ignored. + /// * has no pending retry cooldown + /// * has greatest commit_lsn among the ones that are left fn select_connection_candidate( &self, node_to_omit: Option, ) -> Option<(NodeId, &SkTimelineInfo, String)> { - let all_candidates = self - .applicable_connection_candidates() + self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) - .collect::>(); - - let smallest_attempts_allowed = all_candidates - .iter() - .map(|(sk_id, _, _)| { - self.wal_connection_attempts - .get(sk_id) - .copied() - .unwrap_or(0) - }) - .min()?; - - all_candidates - .into_iter() - .filter(|(sk_id, _, _)| { - smallest_attempts_allowed - >= self - .wal_connection_attempts - .get(sk_id) - .copied() - .unwrap_or(0) - }) .max_by_key(|(_, info, _)| info.commit_lsn) } /// Returns a list of safekeepers that have valid info and ready for connection. + /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, ) -> impl Iterator { + let now = Utc::now().naive_utc(); + self.wal_stream_candidates .iter() .filter(|(_, info)| info.timeline.commit_lsn.is_some()) + .filter(move |(sk_id, _)| { + let next_retry_at = self + .wal_connection_retries + .get(sk_id) + .and_then(|retry_info| { + retry_info.next_retry_at + }); + + next_retry_at.is_none() || next_retry_at.unwrap() <= now + }) .filter_map(|(sk_id, etcd_info)| { let info = &etcd_info.timeline; match wal_stream_connection_string( @@ -627,7 +678,7 @@ impl WalreceiverState { }); for node_id in node_ids_to_remove { - self.wal_connection_attempts.remove(&node_id); + self.wal_connection_retries.remove(&node_id); } } } @@ -684,7 +735,6 @@ fn wal_stream_connection_string( #[cfg(test)] mod tests { - use crate::repository::{ repo_harness::{RepoHarness, TIMELINE_ID}, Repository, @@ -789,7 +839,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: now, commit_lsn: Some(Lsn(current_lsn)), @@ -798,6 +848,7 @@ mod tests { state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); state.wal_connection = Some(WalConnection { + started_at: now, sk_id: connected_sk_id, status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1017,7 +1068,13 @@ mod tests { }, ), ]); - state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]); + state.wal_connection_retries = HashMap::from([( + NodeId(0), + RetryInfo { + next_retry_at: now.checked_add_signed(chrono::Duration::hours(1)), + retry_duration_seconds: WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS, + }, + )]); let candidate_with_less_errors = state .next_connection_candidate() @@ -1025,7 +1082,7 @@ mod tests { assert_eq!( candidate_with_less_errors.safekeeper_id, NodeId(1), - "Should select the node with less connection errors" + "Should select the node with no pending retry cooldown" ); Ok(()) @@ -1043,7 +1100,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: now, commit_lsn: Some(current_lsn), @@ -1051,6 +1108,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: connected_sk_id, status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1130,7 +1188,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: time_over_threshold, latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), @@ -1138,6 +1196,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: NodeId(1), status: connection_status.clone(), connection_task: TaskHandle::spawn(move |sender, _| async move { @@ -1202,7 +1261,7 @@ mod tests { let connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: true, + has_processed_wal: true, latest_connection_update: now, latest_wal_update: time_over_threshold, commit_lsn: Some(current_lsn), @@ -1210,6 +1269,7 @@ mod tests { }; state.wal_connection = Some(WalConnection { + started_at: now, sk_id: NodeId(1), status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), @@ -1281,7 +1341,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), wal_connection: None, wal_stream_candidates: HashMap::new(), - wal_connection_attempts: HashMap::new(), + wal_connection_retries: HashMap::new(), } } } diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index 0688086117..025bfeb506 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -35,8 +35,9 @@ use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; pub struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, - /// Defines a healthy connection as one on which we have received at least some WAL bytes. - pub has_received_wal: bool, + /// Defines a healthy connection as one on which pageserver received WAL from safekeeper + /// and is able to process it in walingest without errors. + pub has_processed_wal: bool, /// Connection establishment time or the timestamp of a latest connection message received. pub latest_connection_update: NaiveDateTime, /// Time of the latest WAL message received. @@ -71,7 +72,7 @@ pub async fn handle_walreceiver_connection( info!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, - has_received_wal: false, + has_processed_wal: false, latest_connection_update: Utc::now().naive_utc(), latest_wal_update: Utc::now().naive_utc(), streaming_lsn: None, @@ -117,13 +118,6 @@ pub async fn handle_walreceiver_connection( let identify = identify_system(&mut replication_client).await?; info!("{identify:?}"); - connection_status.latest_connection_update = Utc::now().naive_utc(); - if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { - warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); - return Ok(()); - } - - // NB: this is a flush_lsn, not a commit_lsn. let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); let mut caught_up = false; let ZTenantTimelineId { @@ -131,6 +125,14 @@ pub async fn handle_walreceiver_connection( timeline_id, } = id; + connection_status.latest_connection_update = Utc::now().naive_utc(); + connection_status.latest_wal_update = Utc::now().naive_utc(); + connection_status.commit_lsn = Some(end_of_wal); + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + return Ok(()); + } + let (repo, timeline) = tokio::task::spawn_blocking(move || { let repo = tenant_mgr::get_repository_for_tenant(tenant_id) .with_context(|| format!("no repository found for tenant {tenant_id}"))?; @@ -181,6 +183,7 @@ pub async fn handle_walreceiver_connection( } { let replication_message = replication_message?; let now = Utc::now().naive_utc(); + let last_rec_lsn_before_msg = last_rec_lsn; // Update the connection status before processing the message. If the message processing // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper. @@ -193,7 +196,6 @@ pub async fn handle_walreceiver_connection( )); if !xlog_data.data().is_empty() { connection_status.latest_wal_update = now; - connection_status.has_received_wal = true; } } ReplicationMessage::PrimaryKeepAlive(keepalive) => { @@ -265,6 +267,15 @@ pub async fn handle_walreceiver_connection( _ => None, }; + if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { + // We have successfully processed at least one WAL record. + connection_status.has_processed_wal = true; + if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + let timeline_to_check = Arc::clone(&timeline); tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) .await diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 63bc9bd517..c90c2a0446 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -90,7 +90,10 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { let cmd = parse_cmd(query_string)?; - info!("got query {:?}", query_string); + info!( + "got query {:?} in timeline {:?}", + query_string, self.ztimelineid + ); let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); @@ -106,23 +109,17 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } match cmd { - SafekeeperPostgresCommand::StartWalPush => { - ReceiveWalConn::new(pgb) - .run(self) - .context("failed to run ReceiveWalConn")?; - } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - ReplicationConn::new(pgb) - .run(self, pgb, start_lsn) - .context("failed to run ReplicationConn")?; - } - SafekeeperPostgresCommand::IdentifySystem => { - self.handle_identify_system(pgb)?; - } - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd)?; - } + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb) + .run(self) + .context("failed to run ReceiveWalConn"), + SafekeeperPostgresCommand::StartReplication { start_lsn } => ReplicationConn::new(pgb) + .run(self, pgb, start_lsn) + .context("failed to run ReplicationConn"), + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), } + .context(format!("timeline {timelineid}"))?; + Ok(()) } } @@ -153,8 +150,15 @@ impl SafekeeperPostgresHandler { /// Handle IDENTIFY_SYSTEM replication command /// fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { - let start_pos = self.timeline.get().get_end_of_wal(); - let lsn = start_pos.to_string(); + let lsn = if self.is_walproposer_recovery() { + // walproposer should get all local WAL until flush_lsn + self.timeline.get().get_end_of_wal() + } else { + // other clients shouldn't get any uncommitted WAL + self.timeline.get().get_state().0.commit_lsn + } + .to_string(); + let sysid = self .timeline .get() @@ -203,4 +207,11 @@ impl SafekeeperPostgresHandler { .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; Ok(()) } + + /// Returns true if current connection is a replication connection, originating + /// from a walproposer recovery function. This connection gets a special handling: + /// safekeeper must stream all local WAL till the flush_lsn, whether committed or not. + pub fn is_walproposer_recovery(&self) -> bool { + self.appname == Some("wal_proposer_recovery".to_string()) + } } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 243d7bf7d0..97ec945c3e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -170,6 +170,7 @@ impl ReplicationConn { // spawn the background thread which receives HotStandbyFeedback messages. let bg_timeline = Arc::clone(spg.timeline.get()); let bg_stream_in = self.stream_in.take().unwrap(); + let bg_timeline_id = spg.ztimelineid.unwrap(); let state = ReplicaState::new(); // This replica_id is used below to check if it's time to stop replication. @@ -188,6 +189,8 @@ impl ReplicationConn { let _ = thread::Builder::new() .name("HotStandbyFeedback thread".into()) .spawn(move || { + let _enter = + info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered(); if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { error!("Replication background thread failed: {}", err); } @@ -198,13 +201,12 @@ impl ReplicationConn { .build()?; runtime.block_on(async move { - let (_, persisted_state) = spg.timeline.get().get_state(); + let (inmem_state, persisted_state) = spg.timeline.get().get_state(); // add persisted_state.timeline_start_lsn == Lsn(0) check if persisted_state.server.wal_seg_size == 0 { bail!("Cannot start replication before connecting to walproposer"); } - let wal_end = spg.timeline.get().get_end_of_wal(); // Walproposer gets special handling: safekeeper must give proposer all // local WAL till the end, whether committed or not (walproposer will // hang otherwise). That's because walproposer runs the consensus and @@ -214,8 +216,8 @@ impl ReplicationConn { // another compute rises which collects majority and starts fixing log // on this safekeeper itself. That's ok as (old) proposer will never be // able to commit such WAL. - let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) - { + let stop_pos: Option = if spg.is_walproposer_recovery() { + let wal_end = spg.timeline.get().get_end_of_wal(); Some(wal_end) } else { None @@ -226,7 +228,7 @@ impl ReplicationConn { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse)?; - let mut end_pos = Lsn(0); + let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( spg.conf.timeline_dir(&spg.timeline.get().zttid), From 77a2bdf3d7714765b1673bec59bdace149332c47 Mon Sep 17 00:00:00 2001 From: Anton Galitsyn Date: Thu, 18 Aug 2022 19:05:40 +0700 Subject: [PATCH 21/21] on safekeeper registration pass availability zone param (#2292) --- .github/ansible/scripts/init_safekeeper.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh index a9b5025562..879891e7a3 100644 --- a/.github/ansible/scripts/init_safekeeper.sh +++ b/.github/ansible/scripts/init_safekeeper.sh @@ -1,7 +1,8 @@ #!/bin/sh -# get instance id from meta-data service +# fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) # store fqdn hostname in var HOST=$(hostname -f) @@ -14,7 +15,8 @@ cat <