Merge remote-tracking branch 'origin/main' into problame/benchmarking/pr/python-perftest

This commit is contained in:
Alexander Bayandin
2024-01-03 13:30:49 +00:00
125 changed files with 6196 additions and 1686 deletions

2
.config/nextest.toml Normal file
View File

@@ -0,0 +1,2 @@
[profile.default]
slow-timeout = "1m"

View File

@@ -0,0 +1,105 @@
name: Build and Push Docker Image
on:
workflow_call:
inputs:
dockerfile-path:
required: true
type: string
image-name:
required: true
type: string
outputs:
build-tools-tag:
description: "tag generated for build tools"
value: ${{ jobs.tag.outputs.build-tools-tag }}
jobs:
check-if-build-tools-dockerfile-changed:
runs-on: ubuntu-latest
outputs:
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
steps:
- name: Check if Dockerfile.buildtools has changed
id: dockerfile
run: |
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
exit
fi
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
tag:
runs-on: ubuntu-latest
needs: [ check-if-build-tools-dockerfile-changed ]
outputs:
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
steps:
- name: Get buildtools tag
env:
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
IMAGE_TAG=$GITHUB_RUN_ID
else
IMAGE_TAG=pinned
fi
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
shell: bash
id: buildtools-tag
kaniko:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, arm64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
name: 'manifest'
runs-on: [ self-hosted, dev, x64 ]
needs:
- tag
- kaniko
- kaniko-arm
- check-if-build-tools-dockerfile-changed
steps:
- name: Create manifest
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -44,7 +44,6 @@ jobs:
exit 1
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
@@ -74,11 +73,19 @@ jobs:
shell: bash
id: build-tag
check-codestyle-python:
build-buildtools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
dockerfile-path: Dockerfile.buildtools
image-name: build-tools
secrets: inherit
check-codestyle-python:
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -108,10 +115,10 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions ]
needs: [ check-permissions, build-buildtools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -175,10 +182,10 @@ jobs:
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag ]
needs: [ check-permissions, tag, build-buildtools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
strategy:
fail-fast: false
@@ -332,16 +339,16 @@ jobs:
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
- name: Run rust tests
run: |
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -351,7 +358,7 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
- name: Install rust binaries
run: |
@@ -408,10 +415,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, tag ]
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Default shared memory is 64mb
options: --init --shm-size=512mb
strategy:
@@ -447,10 +454,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
benchmarks:
needs: [ check-permissions, build-neon ]
needs: [ check-permissions, build-neon, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# Default shared memory is 64mb
options: --init --shm-size=512mb
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -479,12 +486,12 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
steps:
@@ -526,11 +533,10 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests ]
needs: [ check-permissions, regress-tests, build-buildtools-image ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
options: --init
strategy:
fail-fast: false
@@ -694,7 +700,7 @@ jobs:
}"
neon-image:
needs: [ check-permissions, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
@@ -733,6 +739,7 @@ jobs:
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -743,7 +750,7 @@ jobs:
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
@@ -778,6 +785,7 @@ jobs:
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -788,7 +796,7 @@ jobs:
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, tag ]
needs: [ check-permissions, build-buildtools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -836,6 +844,7 @@ jobs:
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

View File

@@ -218,7 +218,7 @@ jobs:
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

View File

@@ -0,0 +1,130 @@
name: 'Update build tools image tag'
# This workflow it used to update tag of build tools in ECR.
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
on:
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
to-tag:
description: 'Destination tag'
required: true
type: string
default: 'pinned'
defaults:
run:
shell: bash -euo pipefail {0}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
permissions: {}
jobs:
tag-image:
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
outputs:
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Get source image digest
id: next-digest
run: |
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
exit 1
fi
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
- name: Get destination image digest (if already exists)
id: prev-digest
run: |
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
if [ -z "${PREV_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
else
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
fi
- name: Tag image
run: |
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
rollback-tag-image:
needs: tag-image
if: ${{ !success() }}
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Restore previous tag if needed
run: |
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
exit 0
fi
if [ -z "${PREV_DIGEST}" ]; then
# I guess we should delete the tag here/untag the image, but crane does not support it
# - https://github.com/google/go-containerregistry/issues/999
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
exit 0
fi
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
else
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
fi

1
.gitignore vendored
View File

@@ -6,6 +6,7 @@ __pycache__/
test_output/
.vscode
.idea
neon.iml
/.neon
/integration_tests/.neon

View File

@@ -70,3 +70,17 @@ We're using the following approach to make it work:
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
## How do I add the "pinned" tag to an buildtools image?
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using GitHub CLI:
```bash
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-f from-tag=6254913013 \
-f to-tag=pinned \
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```

109
Cargo.lock generated
View File

@@ -1161,6 +1161,7 @@ dependencies = [
"flate2",
"futures",
"hyper",
"nix 0.26.2",
"notify",
"num_cpus",
"opentelemetry",
@@ -1168,8 +1169,10 @@ dependencies = [
"regex",
"remote_storage",
"reqwest",
"rust-ini",
"serde",
"serde_json",
"signal-hook",
"tar",
"tokio",
"tokio-postgres",
@@ -1201,6 +1204,26 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
[[package]]
name = "const-random"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
dependencies = [
"const-random-macro",
]
[[package]]
name = "const-random-macro"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
dependencies = [
"getrandom 0.2.11",
"once_cell",
"tiny-keccak",
]
[[package]]
name = "const_fn"
version = "0.4.9"
@@ -1433,6 +1456,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "crypto-bigint"
version = "0.4.9"
@@ -1575,6 +1604,15 @@ dependencies = [
"syn 2.0.32",
]
[[package]]
name = "dlv-list"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
dependencies = [
"const-random",
]
[[package]]
name = "dyn-clone"
version = "1.0.14"
@@ -2106,6 +2144,20 @@ dependencies = [
"hashbrown 0.13.2",
]
[[package]]
name = "hdrhistogram"
version = "7.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
dependencies = [
"base64 0.21.1",
"byteorder",
"crossbeam-channel",
"flate2",
"nom",
"num-traits",
]
[[package]]
name = "heapless"
version = "0.8.0"
@@ -3029,6 +3081,16 @@ dependencies = [
"tokio-stream",
]
[[package]]
name = "ordered-multimap"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
dependencies = [
"dlv-list",
"hashbrown 0.14.0",
]
[[package]]
name = "os_info"
version = "3.7.0"
@@ -3057,6 +3119,28 @@ dependencies = [
"sha2",
]
[[package]]
name = "pagebench"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"hdrhistogram",
"humantime",
"humantime-serde",
"pageserver",
"pageserver_api",
"pageserver_client",
"rand 0.8.5",
"serde",
"serde_json",
"tokio",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "pagectl"
version = "0.1.0"
@@ -4180,6 +4264,16 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "rust-ini"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
dependencies = [
"cfg-if",
"ordered-multimap",
]
[[package]]
name = "rustc-demangle"
version = "0.1.23"
@@ -4311,12 +4405,14 @@ dependencies = [
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
"hex",
"histogram",
@@ -4355,6 +4451,7 @@ dependencies = [
"clap",
"const_format",
"crc32c",
"fail",
"fs2",
"futures",
"git-version",
@@ -5134,6 +5231,15 @@ dependencies = [
"time-core",
]
[[package]]
name = "tiny-keccak"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
dependencies = [
"crunchy",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
@@ -5777,6 +5883,7 @@ dependencies = [
"chrono",
"const_format",
"criterion",
"fail",
"futures",
"heapless",
"hex",
@@ -6301,6 +6408,7 @@ dependencies = [
"futures-io",
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hex",
"hmac",
"hyper",
@@ -6312,6 +6420,7 @@ dependencies = [
"num-bigint",
"num-integer",
"num-traits",
"once_cell",
"prost",
"rand 0.8.5",
"regex",

View File

@@ -6,6 +6,7 @@ members = [
"pageserver",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"safekeeper",
"storage_broker",
@@ -79,6 +80,7 @@ futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.13"
hashlink = "0.8.1"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"

View File

@@ -3,7 +3,7 @@
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
### inside this image in the real deployments.
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG IMAGE=build-tools
ARG TAG=pinned
# Build Postgres

166
Dockerfile.buildtools Normal file
View File

@@ -0,0 +1,166 @@
FROM debian:bullseye-slim
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
# System deps
RUN set -e \
&& apt update \
&& apt install -y \
autoconf \
automake \
bison \
build-essential \
ca-certificates \
cmake \
curl \
flex \
git \
gnupg \
gzip \
jq \
libcurl4-openssl-dev \
libbz2-dev \
libffi-dev \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libpq-dev \
libreadline-dev \
libseccomp-dev \
libsqlite3-dev \
libssl-dev \
libstdc++-10-dev \
libtool \
libxml2-dev \
libxmlsec1-dev \
libxxhash-dev \
lsof \
make \
netcat \
net-tools \
openssh-client \
parallel \
pkg-config \
unzip \
wget \
xz-utils \
zlib1g-dev \
zstd \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# protobuf-compiler (protoc)
ENV PROTOC_VERSION 25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
&& unzip -q protoc.zip -d protoc \
&& mv protoc/bin/protoc /usr/local/bin/protoc \
&& mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc
# LLVM
ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# PostgreSQL 14
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
&& apt update \
&& apt install -y postgresql-client-14 \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
&& unzip -q awscliv2.zip \
&& ./aws/install \
&& rm awscliv2.zip
# Mold: A Modern Linker
ENV MOLD_VERSION v2.4.0
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \
&& cd mold/build \
&& git checkout ${MOLD_VERSION} \
&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
&& cmake --build . -j $(nproc) \
&& cmake --install . \
&& cd .. \
&& rm -rf mold
# LCOV
# Build lcov from a fork:
# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
# And patches from us:
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
&& cd lcov \
&& make install \
&& rm -rf ../lcov.tar.gz
# Switch to nonroot user
USER nonroot:nonroot
WORKDIR /home/nonroot
# Python
ENV PYTHON_VERSION=3.9.2 \
PYENV_ROOT=/home/nonroot/.pyenv \
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
&& cd $HOME \
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
&& chmod +x pyenv-installer \
&& ./pyenv-installer \
&& export PYENV_ROOT=/home/nonroot/.pyenv \
&& export PATH="$PYENV_ROOT/bin:$PATH" \
&& export PATH="$PYENV_ROOT/shims:$PATH" \
&& pyenv install ${PYTHON_VERSION} \
&& pyenv global ${PYTHON_VERSION} \
&& python --version \
&& pip install --upgrade pip \
&& pip --version \
&& pip install pipenv wheel poetry
# Switch to nonroot user (again)
USER nonroot:nonroot
WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.74.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
chmod +x rustup-init && \
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
rm rustup-init && \
export PATH="$HOME/.cargo/bin:$PATH" && \
. "$HOME/.cargo/env" && \
cargo --version && rustup --version && \
rustup component add llvm-tools-preview rustfmt clippy && \
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \
rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER=cachepot
# Show versions
RUN whoami \
&& python --version \
&& pip --version \
&& cargo --version --verbose \
&& rustup --version --verbose \
&& rustc --version --verbose \
&& clang --version

View File

@@ -1,6 +1,6 @@
ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
@@ -48,7 +48,29 @@ RUN cd postgres && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla postgres this function is limited to Postgres role superuser.
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
# so we do it here.
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
# the first loop is for pg_stat_statement extension version <= 1.6
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
fi; \
done; \
# the second loop is for pg_stat_statement extension versions >= 1.7,
# where pg_stat_statement_reset() got 3 additional arguments
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
filename=$(basename "$file"); \
if ! echo "$old_list" | grep -q -F "$filename"; then \
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
fi; \
done
#########################################################################################
#

View File

@@ -1,7 +1,7 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG

View File

@@ -13,6 +13,7 @@ clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
opentelemetry.workspace = true
@@ -20,6 +21,7 @@ postgres.workspace = true
regex.workspace = true
serde.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -39,3 +41,4 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"
bytes = "1.0"
rust-ini = "0.20.0"

View File

@@ -31,25 +31,31 @@
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway
//! -r http://pg-ext-s3-gateway \
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
//! ```
//!
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
use std::process::exit;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::http::api::launch_http_server;
@@ -65,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
thread::spawn(move || {
for sig in signals.forever() {
handle_exit_signal(sig);
}
});
let build_tag = option_env!("BUILD_TAG")
.unwrap_or(BUILD_TAG_DEFAULT)
.to_string();
@@ -99,6 +112,9 @@ fn main() -> Result<()> {
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -209,6 +225,8 @@ fn main() -> Result<()> {
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
};
let compute = Arc::new(compute_node);
@@ -339,6 +357,7 @@ fn main() -> Result<()> {
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
PG_PID.store(0, Ordering::SeqCst);
info!("Postgres exited with code {}, shutting down", ecode);
exit_code = ecode.code()
}
@@ -493,6 +512,41 @@ fn cli() -> clap::Command {
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-connstr")
.long("pgbouncer-connstr")
.default_value(
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
)
.value_name("PGBOUNCER_CONNSTR"),
)
.arg(
Arg::new("pgbouncer-ini-path")
.long("pgbouncer-ini-path")
// Note: this doesn't match current path for pgbouncer.ini.
// Until we fix it, we need to pass the path explicitly
// or this will be effectively no-op.
.default_value("/etc/pgbouncer.ini")
.value_name("PGBOUNCER_INI_PATH"),
)
}
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
/// wait for termination which would be easy then.
fn handle_exit_signal(sig: i32) {
info!("received {sig} termination signal");
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
if ss_pid != 0 {
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
kill(ss_pid, Signal::SIGTERM).ok();
}
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
kill(pg_pid, Signal::SIGTERM).ok();
}
exit(1);
}
#[test]

View File

@@ -6,7 +6,10 @@ use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::{Condvar, Mutex, RwLock};
use std::thread;
use std::time::Instant;
use anyhow::{Context, Result};
@@ -33,6 +36,9 @@ use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::{config, extension_server};
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
// Url type maintains proper escaping
@@ -64,6 +70,10 @@ pub struct ComputeNode {
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
// connection string to pgbouncer to change settings
pub pgbouncer_connstr: Option<String>,
// path to pgbouncer.ini to change settings
pub pgbouncer_ini_path: Option<String>,
}
// store some metrics about download size that might impact startup time
@@ -496,6 +506,7 @@ impl ComputeNode {
.stdout(Stdio::piped())
.spawn()
.expect("postgres --sync-safekeepers failed to start");
SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
// `postgres --sync-safekeepers` will print all log output to stderr and
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
@@ -503,6 +514,7 @@ impl ComputeNode {
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
if !sync_output.status.success() {
anyhow::bail!(
@@ -657,6 +669,7 @@ impl ComputeNode {
})
.spawn()
.expect("cannot start postgres process");
PG_PID.store(pg.id(), Ordering::SeqCst);
wait_for_postgres(&mut pg, pgdata_path)?;
@@ -737,6 +750,31 @@ impl ComputeNode {
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
});
}
// Write new config
let pgdata_path = Path::new(&self.pgdata);
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -791,6 +829,32 @@ impl ComputeNode {
pspec.timeline_id,
);
// tune pgbouncer
if let Some(connstr) = &self.pgbouncer_connstr {
info!("tuning pgbouncer with connstr: {:?}", connstr);
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
// Spawn a thread to do the tuning,
// so that we don't block the main thread that starts Postgres.
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
let connstr_clone = connstr.clone();
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
let _handle = thread::spawn(move || {
let res = rt.block_on(tune_pgbouncer(
pgbouncer_settings,
&connstr_clone,
pgbouncer_ini_path,
));
if let Err(err) = res {
error!("error while tuning pgbouncer: {err:?}");
}
});
}
info!(
"start_compute spec.remote_extensions {:?}",
pspec.spec.remote_extensions

View File

@@ -9,9 +9,11 @@ use std::process::Child;
use std::time::{Duration, Instant};
use anyhow::{bail, Result};
use ini::Ini;
use notify::{RecursiveMode, Watcher};
use postgres::{Client, Transaction};
use tracing::{debug, instrument};
use tokio_postgres::NoTls;
use tracing::{debug, error, info, instrument};
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
@@ -359,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
Ok(())
}
/// Update pgbouncer.ini with provided options
pub fn update_pgbouncer_ini(
pgbouncer_config: HashMap<String, String>,
pgbouncer_ini_path: &str,
) -> Result<()> {
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
let section = conf.section_mut(Some("pgbouncer")).unwrap();
for (option_name, value) in pgbouncer_config.iter() {
section.insert(option_name, value);
}
conf.write_to_file(pgbouncer_ini_path)?;
Ok(())
}
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
pgbouncer_settings: Option<HashMap<String, String>>,
pgbouncer_connstr: &str,
pgbouncer_ini_path: Option<String>,
) -> Result<()> {
if let Some(pgbouncer_config) = pgbouncer_settings {
// Apply new config
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
let (client, connection) = connect_result.unwrap();
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
for (option_name, value) in pgbouncer_config.iter() {
info!(
"Applying pgbouncer setting change: {} = {}",
option_name, value
);
let query = format!("SET {} = {}", option_name, value);
let result = client.simple_query(&query).await;
info!("Applying pgbouncer setting change: {}", query);
info!("pgbouncer setting change result: {:?}", result);
if let Err(err) = result {
// Don't fail on error, just print it into log
error!(
"Failed to apply pgbouncer setting change: {}, {}",
query, err
);
};
}
// save values to pgbouncer.ini
// so that they are preserved after pgbouncer restart
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
}
}
Ok(())
}

View File

@@ -46,6 +46,8 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::RemoteExtSpec;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
@@ -439,11 +441,14 @@ impl Endpoint {
Ok(())
}
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
if send_sigterm {
kill(pid, Signal::SIGTERM).ok();
}
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(())
}
@@ -537,6 +542,7 @@ impl Endpoint {
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -732,10 +738,15 @@ impl Endpoint {
&None,
)?;
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
// Also wait for the compute_ctl process to die. It might have some
// cleanup work to do after postgres stops, like syncing safekeepers,
// etc.
//
self.wait_for_compute_ctl_to_exit()?;
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
// want this cleanup: tests intentionally do stop when majority of
// safekeepers is down, so sync-safekeepers would hang otherwise. This
// could be a separate flag though.
self.wait_for_compute_ctl_to_exit(destroy)?;
if destroy {
println!(
"Destroying postgres data directory '{}'",

View File

@@ -35,6 +35,7 @@ allow = [
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"CC0-1.0",
"ISC",
"MIT",
"MPL-2.0",

View File

@@ -73,6 +73,8 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
pub pgbouncer_settings: Option<HashMap<String, String>>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

View File

@@ -243,5 +243,9 @@
"public_extensions": [
"postgis"
]
},
"pgbouncer_settings": {
"default_pool_size": "42",
"pool_mode": "session"
}
}

View File

@@ -559,19 +559,6 @@ pub enum DownloadRemoteLayersTaskState {
ShutDown,
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,

View File

@@ -81,6 +81,10 @@ impl TenantShardId {
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
}
/// Formatting helper
@@ -159,7 +163,7 @@ impl From<[u8; 18]> for TenantShardId {
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,

View File

@@ -117,6 +117,8 @@ impl AzureBlobStorage {
) -> Result<Download, DownloadError> {
let mut response = builder.into_stream();
let mut etag = None;
let mut last_modified = None;
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
@@ -124,6 +126,13 @@ impl AzureBlobStorage {
let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?;
let etag_str: &str = part.blob.properties.etag.as_ref();
if etag.is_none() {
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
}
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
@@ -136,6 +145,8 @@ impl AzureBlobStorage {
}
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag,
last_modified,
metadata: Some(StorageMetadata(metadata)),
})
}

View File

@@ -14,7 +14,9 @@ mod local_fs;
mod s3_bucket;
mod simulate_failures;
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
use std::{
collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
};
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};
@@ -207,8 +209,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
}
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
pub struct Download {
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
pub download_stream: DownloadStream,
/// The last time the file was modified (`last-modified` HTTP header)
pub last_modified: Option<SystemTime>,
/// A way to identify this specific version of the resource (`etag` HTTP header)
pub etag: Option<String>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}

View File

@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
use super::{RemoteStorage, StorageMetadata};
@@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs {
.map_err(DownloadError::Other)?;
Ok(Download {
metadata,
last_modified: None,
etag: None,
download_stream: Box::pin(source),
})
} else {
@@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs {
.await
.map_err(DownloadError::Other)?;
Ok(match end_exclusive {
Some(end_exclusive) => Download {
metadata,
download_stream: Box::pin(ReaderStream::new(
source.take(end_exclusive - start_inclusive),
)),
},
None => Download {
metadata,
download_stream: Box::pin(ReaderStream::new(source)),
},
let download_stream: DownloadStream = match end_exclusive {
Some(end_exclusive) => Box::pin(ReaderStream::new(
source.take(end_exclusive - start_inclusive),
)),
None => Box::pin(ReaderStream::new(source)),
};
Ok(Download {
metadata,
last_modified: None,
etag: None,
download_stream,
})
} else {
Err(DownloadError::NotFound)

View File

@@ -16,6 +16,7 @@ use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
profile::ProfileFileCredentialsProvider,
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
@@ -74,20 +75,29 @@ impl S3Bucket {
let region = Some(Region::new(aws_config.bucket_region.clone()));
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else("token", {
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build()
})
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
@@ -218,17 +228,11 @@ impl S3Bucket {
let started_at = ScopeGuard::into_inner(started_at);
if get_object.is_err() {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
}
match get_object {
Ok(object_output) => {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
let etag = object_output.e_tag.clone();
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
let body = object_output.body;
let body = ByteStreamAsStream::from(body);
@@ -237,15 +241,33 @@ impl S3Bucket {
Ok(Download {
metadata,
etag,
last_modified,
download_stream: Box::pin(body),
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
Err(DownloadError::NotFound)
}
Err(e) => Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
)),
Err(e) => {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
))
}
}
}
}

View File

@@ -4,6 +4,12 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
@@ -16,6 +22,7 @@ chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true
nix.workspace = true

View File

@@ -1,3 +1,14 @@
//! Failpoint support code shared between pageserver and safekeepers.
use crate::http::{
error::ApiError,
json::{json_request, json_response},
};
use hyper::{Body, Request, Response, StatusCode};
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::*;
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
@@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
#[doc(hidden)]
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
@@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> {
scenario
}
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
if actions == "exit" {
fail::cfg_callback(name, exit_failpoint)
} else {
@@ -84,3 +95,45 @@ fn exit_failpoint() {
tracing::info!("Exit requested by failpoint");
std::process::exit(1);
}
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
/// Information for configuring a single fail point
#[derive(Debug, Serialize, Deserialize)]
pub struct FailpointConfig {
/// Name of the fail point
pub name: String,
/// List of actions to take, using the format described in `fail::cfg`
///
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
pub actions: String,
}
/// Configure failpoints through http.
pub async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Cannot manage failpoints because storage was compiled without failpoints support"
)));
}
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
for fp in failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Failed to configure failpoints: {err_msg}"
)));
}
}
json_response(StatusCode::OK, ())
}

View File

@@ -83,6 +83,8 @@ pub mod timeout;
pub mod sync;
pub mod failpoint_support;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -366,6 +366,49 @@ impl MonotonicCounter<Lsn> for RecordLsn {
}
}
/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
///
/// This is used by the `pagebench` pageserver benchmarking tool.
pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
impl rand::distributions::uniform::SampleUniform for Lsn {
type Sampler = LsnSampler;
}
impl rand::distributions::uniform::UniformSampler for LsnSampler {
type X = Lsn;
fn new<B1, B2>(low: B1, high: B2) -> Self
where
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
{
Self(
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
low.borrow().0,
high.borrow().0,
),
)
}
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
where
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
{
Self(
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
low.borrow().0,
high.borrow().0,
),
)
}
fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
Lsn(self.0.sample(rng))
}
}
#[cfg(test)]
mod tests {
use crate::bin_ser::BeSer;

View File

@@ -8,12 +8,12 @@ use std::ffi::CString;
use crate::bindings::uint32;
use crate::bindings::walproposer_api;
use crate::bindings::NeonWALReadResult;
use crate::bindings::PGAsyncReadResult;
use crate::bindings::PGAsyncWriteResult;
use crate::bindings::Safekeeper;
use crate::bindings::Size;
use crate::bindings::StringInfoData;
use crate::bindings::TimeLineID;
use crate::bindings::TimestampTz;
use crate::bindings::WalProposer;
use crate::bindings::WalProposerConnStatusType;
@@ -178,31 +178,11 @@ extern "C" fn conn_blocking_write(
}
}
extern "C" fn recovery_download(
sk: *mut Safekeeper,
_timeline: TimeLineID,
startpos: XLogRecPtr,
endpos: XLogRecPtr,
) -> bool {
extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).recovery_download(&mut (*sk), startpos, endpos)
}
}
#[allow(clippy::unnecessary_cast)]
extern "C" fn wal_read(
sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char,
startptr: XLogRecPtr,
count: Size,
) {
unsafe {
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).wal_read(&mut (*sk), buf, startptr)
(*api).recovery_download(&mut (*wp), &mut (*sk))
}
}
@@ -214,11 +194,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
}
}
extern "C" fn free_event_set(wp: *mut WalProposer) {
#[allow(clippy::unnecessary_cast)]
extern "C" fn wal_read(
sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char,
startptr: XLogRecPtr,
count: Size,
_errmsg: *mut *mut ::std::os::raw::c_char,
) -> NeonWALReadResult {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).free_event_set(&mut (*wp));
// TODO: errmsg is not forwarded
(*api).wal_read(&mut (*sk), buf, startptr)
}
}
extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).wal_reader_events(&mut (*sk))
}
}
@@ -238,6 +235,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
}
}
extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).active_state_update_event_set(&mut (*sk));
}
}
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
@@ -246,6 +251,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
}
}
extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).rm_safekeeper_event_set(&mut (*sk));
}
}
extern "C" fn wait_event_set(
wp: *mut WalProposer,
timeout: ::std::os::raw::c_long,
@@ -313,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
}
}
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).confirm_wal_streamed(&mut (*wp), lsn)
}
}
extern "C" fn log_internal(
wp: *mut WalProposer,
level: ::std::os::raw::c_int,
@@ -335,14 +340,6 @@ extern "C" fn log_internal(
}
}
extern "C" fn after_election(wp: *mut WalProposer) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).after_election(&mut (*wp))
}
}
#[derive(Debug)]
pub enum Level {
Debug5,
@@ -401,20 +398,20 @@ pub(crate) fn create_api() -> walproposer_api {
conn_async_write: Some(conn_async_write),
conn_blocking_write: Some(conn_blocking_write),
recovery_download: Some(recovery_download),
wal_read: Some(wal_read),
wal_reader_allocate: Some(wal_reader_allocate),
free_event_set: Some(free_event_set),
wal_read: Some(wal_read),
wal_reader_events: Some(wal_reader_events),
init_event_set: Some(init_event_set),
update_event_set: Some(update_event_set),
active_state_update_event_set: Some(active_state_update_event_set),
add_safekeeper_event_set: Some(add_safekeeper_event_set),
rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
wait_event_set: Some(wait_event_set),
strong_random: Some(strong_random),
get_redo_start_lsn: Some(get_redo_start_lsn),
finish_sync_safekeepers: Some(finish_sync_safekeepers),
process_safekeeper_feedback: Some(process_safekeeper_feedback),
confirm_wal_streamed: Some(confirm_wal_streamed),
log_internal: Some(log_internal),
after_election: Some(after_election),
}
}

View File

@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
use crate::{
api_bindings::{create_api, take_vec_u8, Level},
bindings::{
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
WalProposerStart,
NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
WalProposerFree, WalProposerStart,
},
};
@@ -86,19 +86,19 @@ pub trait ApiImpl {
todo!()
}
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
todo!()
}
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
todo!()
}
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
todo!()
}
fn free_event_set(&self, _wp: &mut WalProposer) {
fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
todo!()
}
@@ -110,10 +110,18 @@ pub trait ApiImpl {
todo!()
}
fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
todo!()
}
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
todo!()
}
fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
todo!()
}
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
todo!()
}
@@ -134,10 +142,6 @@ pub trait ApiImpl {
todo!()
}
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
todo!()
}
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
todo!()
}
@@ -240,6 +244,7 @@ impl Drop for Wrapper {
#[cfg(test)]
mod tests {
use core::panic;
use std::{
cell::Cell,
sync::{atomic::AtomicUsize, mpsc::sync_channel},
@@ -247,7 +252,7 @@ mod tests {
use utils::id::TenantTimelineId;
use crate::{api_bindings::Level, walproposer::Wrapper};
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
use super::ApiImpl;
@@ -355,12 +360,17 @@ mod tests {
true
}
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
println!("wal_reader_allocate")
fn recovery_download(
&self,
_wp: &mut crate::bindings::WalProposer,
_sk: &mut crate::bindings::Safekeeper,
) -> bool {
true
}
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
println!("free_event_set")
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
println!("wal_reader_allocate");
crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
}
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
@@ -383,6 +393,13 @@ mod tests {
self.wait_events.set(WaitEventsData { sk, event_mask });
}
fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
println!(
"rm_safekeeper_event_set, sk={:?}",
sk as *mut crate::bindings::Safekeeper
);
}
fn wait_event_set(
&self,
_: &mut crate::bindings::WalProposer,
@@ -408,7 +425,7 @@ mod tests {
}
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
println!("walprop_log[{}] {}", level, msg);
println!("wp_log[{}] {}", level, msg);
}
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {

View File

@@ -5,6 +5,8 @@ use utils::{
id::{TenantId, TimelineId},
};
pub mod util;
#[derive(Debug)]
pub struct Client {
mgmt_api_endpoint: String,

View File

@@ -0,0 +1,49 @@
//! Helpers to do common higher-level tasks with the [`Client`].
use std::sync::Arc;
use tokio::task::JoinSet;
use utils::id::{TenantId, TenantTimelineId};
use super::Client;
/// Retrieve a list of all of the pageserver's timelines.
///
/// Fails if there are sharded tenants present on the pageserver.
pub async fn get_pageserver_tenant_timelines_unsharded(
api_client: &Arc<Client>,
) -> anyhow::Result<Vec<TenantTimelineId>> {
let mut timelines: Vec<TenantTimelineId> = Vec::new();
let mut tenants: Vec<TenantId> = Vec::new();
for ti in api_client.list_tenants().await? {
if !ti.id.is_unsharded() {
anyhow::bail!(
"only unsharded tenants are supported at this time: {}",
ti.id
);
}
tenants.push(ti.id.tenant_id)
}
let mut js = JoinSet::new();
for tenant_id in tenants {
js.spawn({
let mgmt_api_client = Arc::clone(api_client);
async move {
(
tenant_id,
mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
)
}
});
}
while let Some(res) = js.join_next().await {
let (tenant_id, details) = res.unwrap();
for timeline_id in details.timelines {
timelines.push(TenantTimelineId {
tenant_id,
timeline_id,
});
}
}
Ok(timelines)
}

View File

@@ -115,15 +115,8 @@ impl PagestreamClient {
pub async fn getpage(
&mut self,
key: RelTagBlockNo,
lsn: Lsn,
req: PagestreamGetPageRequest,
) -> anyhow::Result<PagestreamGetPageResponse> {
let req = PagestreamGetPageRequest {
latest: false,
rel: key.rel_tag,
blkno: key.block_no,
lsn,
};
let req = PagestreamFeMessage::GetPage(req);
let req: bytes::Bytes = req.serialize();
// let mut req = tokio_util::io::ReaderStream::new(&req);

View File

@@ -0,0 +1,26 @@
[package]
name = "pagebench"
version = "0.1.0"
edition.workspace = true
license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow.workspace = true
clap.workspace = true
futures.workspace = true
hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true
tokio.workspace = true
pageserver = { path = ".." }
pageserver_client.workspace = true
pageserver_api.workspace = true
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,272 @@
use anyhow::Context;
use pageserver_client::page_service::BasebackupRequest;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{debug, info, instrument};
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::ops::Range;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};
/// basebackup@LatestLSN
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "localhost:64000")]
page_service_host_port: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
num_clients: NonZeroUsize,
#[clap(long, default_value = "1.0")]
gzip_probability: f64,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
#[derive(Debug, Default)]
struct LiveStats {
completed_requests: AtomicU64,
}
impl LiveStats {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
}
struct Target {
timeline: TenantTimelineId,
lsn_range: Option<Range<Lsn>>,
}
#[derive(serde::Serialize)]
struct Output {
total: request_stats::Output,
}
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: args.limit_to_first_n_targets,
targets: args.targets.clone(),
},
)
.await?;
let mut js = JoinSet::new();
for timeline in &timelines {
js.spawn({
let timeline = *timeline;
// FIXME: this triggers initial logical size calculation
// https://github.com/neondatabase/neon/issues/6168
let info = mgmt_api_client
.timeline_info(timeline.tenant_id, timeline.timeline_id)
.await
.unwrap();
async move {
anyhow::Ok(Target {
timeline,
// TODO: support lsn_range != latest LSN
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
})
}
});
}
let mut all_targets: Vec<Target> = Vec::new();
while let Some(res) = js.join_next().await {
all_targets.push(res.unwrap().unwrap());
}
let live_stats = Arc::new(LiveStats::default());
let num_client_tasks = timelines.len();
let num_live_stats_dump = 1;
let num_work_sender_tasks = 1;
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
));
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
tokio::spawn({
let stats = Arc::clone(&live_stats);
let start_work_barrier = Arc::clone(&start_work_barrier);
async move {
start_work_barrier.wait().await;
loop {
let start = std::time::Instant::now();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
let elapsed = start.elapsed();
info!(
"RPS: {:.0}",
completed_requests as f64 / elapsed.as_secs_f64()
);
}
}
});
let mut work_senders = HashMap::new();
let mut tasks = Vec::new();
for tl in &timelines {
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
work_senders.insert(tl, sender);
tasks.push(tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
)));
}
let work_sender = async move {
start_work_barrier.wait().await;
loop {
let (timeline, work) = {
let mut rng = rand::thread_rng();
let target = all_targets.choose(&mut rng).unwrap();
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
(
target.timeline,
Work {
lsn,
gzip: rng.gen_bool(args.gzip_probability),
},
)
};
let sender = work_senders.get(&timeline).unwrap();
// TODO: what if this blocks?
sender.send(work).await.ok().unwrap();
}
};
if let Some(runtime) = args.runtime {
match tokio::time::timeout(runtime.into(), work_sender).await {
Ok(()) => unreachable!("work sender never terminates"),
Err(_timeout) => {
// this implicitly drops the work_senders, making all the clients exit
}
}
} else {
work_sender.await;
unreachable!("work sender never terminates");
}
for t in tasks {
t.await.unwrap();
}
let output = Output {
total: {
let mut agg_stats = request_stats::Stats::new();
for stats in all_thread_local_stats.lock().unwrap().iter() {
let stats = stats.lock().unwrap();
agg_stats.add(&stats);
}
agg_stats.output()
},
};
let output = serde_json::to_string_pretty(&output).unwrap();
println!("{output}");
anyhow::Ok(())
}
#[derive(Copy, Clone)]
struct Work {
lsn: Option<Lsn>,
gzip: bool,
}
#[instrument(skip_all)]
async fn client(
args: &'static Args,
timeline: TenantTimelineId,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<Work>,
all_work_done_barrier: Arc<Barrier>,
live_stats: Arc<LiveStats>,
) {
start_work_barrier.wait().await;
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
&args.page_service_host_port,
args.pageserver_jwt.as_deref(),
))
.await
.unwrap();
while let Some(Work { lsn, gzip }) = work.recv().await {
let start = Instant::now();
let copy_out_stream = client
.basebackup(&BasebackupRequest {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
lsn,
gzip,
})
.await
.with_context(|| format!("start basebackup for {timeline}"))
.unwrap();
use futures::StreamExt;
let size = Arc::new(AtomicUsize::new(0));
copy_out_stream
.for_each({
|r| {
let size = Arc::clone(&size);
async move {
let size = Arc::clone(&size);
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
}
}
})
.await;
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
all_work_done_barrier.wait().await;
}

View File

@@ -0,0 +1,351 @@
use anyhow::Context;
use futures::future::join_all;
use pageserver::pgdatadir_mapping::key_to_rel_block;
use pageserver::repository;
use pageserver_api::key::is_rel_block_key;
use pageserver_api::models::PagestreamGetPageRequest;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{info, instrument};
use std::collections::HashMap;
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
num_clients: NonZeroUsize,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long)]
per_target_rate_limit: Option<usize>,
/// Probability for sending `latest=true` in the request (uniform distribution).
#[clap(long, default_value = "1")]
req_latest_probability: f64,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
#[derive(Debug, Default)]
struct LiveStats {
completed_requests: AtomicU64,
}
impl LiveStats {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
}
#[derive(Clone)]
struct KeyRange {
timeline: TenantTimelineId,
timeline_lsn: Lsn,
start: i128,
end: i128,
}
impl KeyRange {
fn len(&self) -> i128 {
self.end - self.start
}
}
#[derive(serde::Serialize)]
struct Output {
total: request_stats::Output,
}
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: args.limit_to_first_n_targets,
targets: args.targets.clone(),
},
)
.await?;
let mut js = JoinSet::new();
for timeline in &timelines {
js.spawn({
let mgmt_api_client = Arc::clone(&mgmt_api_client);
let timeline = *timeline;
async move {
let partitioning = mgmt_api_client
.keyspace(timeline.tenant_id, timeline.timeline_id)
.await?;
let lsn = partitioning.at_lsn;
let ranges = partitioning
.keys
.ranges
.iter()
.filter_map(|r| {
let start = r.start;
let end = r.end;
// filter out non-relblock keys
match (is_rel_block_key(&start), is_rel_block_key(&end)) {
(true, true) => Some(KeyRange {
timeline,
timeline_lsn: lsn,
start: start.to_i128(),
end: end.to_i128(),
}),
(true, false) | (false, true) => {
unimplemented!("split up range")
}
(false, false) => None,
}
})
.collect::<Vec<_>>();
anyhow::Ok(ranges)
}
});
}
let mut all_ranges: Vec<KeyRange> = Vec::new();
while let Some(res) = js.join_next().await {
all_ranges.extend(res.unwrap().unwrap());
}
let live_stats = Arc::new(LiveStats::default());
let num_client_tasks = timelines.len();
let num_live_stats_dump = 1;
let num_work_sender_tasks = 1;
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
));
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
tokio::spawn({
let stats = Arc::clone(&live_stats);
let start_work_barrier = Arc::clone(&start_work_barrier);
async move {
start_work_barrier.wait().await;
loop {
let start = std::time::Instant::now();
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
let elapsed = start.elapsed();
info!(
"RPS: {:.0}",
completed_requests as f64 / elapsed.as_secs_f64()
);
}
}
});
let mut work_senders = HashMap::new();
let mut tasks = Vec::new();
for tl in &timelines {
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
work_senders.insert(tl, sender);
tasks.push(tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
)));
}
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
None => Box::pin(async move {
let weights = rand::distributions::weighted::WeightedIndex::new(
all_ranges.iter().map(|v| v.len()),
)
.unwrap();
start_work_barrier.wait().await;
loop {
let (timeline, req) = {
let mut rng = rand::thread_rng();
let r = &all_ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let (rel_tag, block_no) =
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
(
r.timeline,
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
},
)
};
let sender = work_senders.get(&timeline).unwrap();
// TODO: what if this blocks?
sender.send(req).await.ok().unwrap();
}
}),
Some(rps_limit) => Box::pin(async move {
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
let make_timeline_task: &dyn Fn(
TenantTimelineId,
)
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
let sender = work_senders.get(&timeline).unwrap();
let ranges: Vec<KeyRange> = all_ranges
.iter()
.filter(|r| r.timeline == timeline)
.cloned()
.collect();
let weights = rand::distributions::weighted::WeightedIndex::new(
ranges.iter().map(|v| v.len()),
)
.unwrap();
Box::pin(async move {
let mut ticker = tokio::time::interval(period);
ticker.set_missed_tick_behavior(
/* TODO review this choice */
tokio::time::MissedTickBehavior::Burst,
);
loop {
ticker.tick().await;
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = repository::Key::from_i128(key);
let (rel_tag, block_no) = key_to_rel_block(key)
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
latest: rng.gen_bool(args.req_latest_probability),
lsn: r.timeline_lsn,
rel: rel_tag,
blkno: block_no,
}
};
sender.send(req).await.ok().unwrap();
}
})
};
let tasks: Vec<_> = work_senders
.keys()
.map(|tl| make_timeline_task(**tl))
.collect();
start_work_barrier.wait().await;
join_all(tasks).await;
}),
};
if let Some(runtime) = args.runtime {
match tokio::time::timeout(runtime.into(), work_sender).await {
Ok(()) => unreachable!("work sender never terminates"),
Err(_timeout) => {
// this implicitly drops the work_senders, making all the clients exit
}
}
} else {
work_sender.await;
unreachable!("work sender never terminates");
}
for t in tasks {
t.await.unwrap();
}
let output = Output {
total: {
let mut agg_stats = request_stats::Stats::new();
for stats in all_thread_local_stats.lock().unwrap().iter() {
let stats = stats.lock().unwrap();
agg_stats.add(&stats);
}
agg_stats.output()
},
};
let output = serde_json::to_string_pretty(&output).unwrap();
println!("{output}");
anyhow::Ok(())
}
#[instrument(skip_all)]
async fn client(
args: &'static Args,
timeline: TenantTimelineId,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
all_work_done_barrier: Arc<Barrier>,
live_stats: Arc<LiveStats>,
) {
start_work_barrier.wait().await;
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
.await
.unwrap();
let mut client = client
.pagestream(timeline.tenant_id, timeline.timeline_id)
.await
.unwrap();
while let Some(req) = work.recv().await {
let start = Instant::now();
client
.getpage(req)
.await
.with_context(|| format!("getpage for {timeline}"))
.unwrap();
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
all_work_done_barrier.wait().await;
}

View File

@@ -0,0 +1,85 @@
use std::sync::Arc;
use humantime::Duration;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "localhost:64000")]
page_service_host_port: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(
long,
help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
)]
poll_for_completion: Option<Duration>,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let main_task = rt.spawn(main_impl(args));
rt.block_on(main_task).unwrap()
}
async fn main_impl(args: Args) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: args.limit_to_first_n_targets,
targets: args.targets.clone(),
},
)
.await?;
// kick it off
let mut js = JoinSet::new();
for tl in timelines {
let mgmt_api_client = Arc::clone(&mgmt_api_client);
js.spawn(async move {
// TODO: API to explicitly trigger initial logical size computation.
// Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
// => https://github.com/neondatabase/neon/issues/6168
let info = mgmt_api_client
.timeline_info(tl.tenant_id, tl.timeline_id)
.await
.unwrap();
if let Some(period) = args.poll_for_completion {
let mut ticker = tokio::time::interval(period.into());
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
let mut info = info;
while !info.current_logical_size_is_accurate {
ticker.tick().await;
info = mgmt_api_client
.timeline_info(tl.tenant_id, tl.timeline_id)
.await
.unwrap();
}
}
});
}
while let Some(res) = js.join_next().await {
let _: () = res.unwrap();
}
Ok(())
}

View File

@@ -0,0 +1,48 @@
use clap::Parser;
use utils::logging;
/// Re-usable pieces of code that aren't CLI-specific.
mod util {
pub(crate) mod connstring;
pub(crate) mod request_stats;
#[macro_use]
pub(crate) mod tokio_thread_local_stats;
/// Re-usable pieces of CLI-specific code.
pub(crate) mod cli {
pub(crate) mod targets;
}
}
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod trigger_initial_size_calculation;
}
/// Component-level performance test for pageserver.
#[derive(clap::Parser)]
enum Args {
Basebackup(cmd::basebackup::Args),
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
}
fn main() {
logging::init(
logging::LogFormat::Plain,
logging::TracingErrorLayerEnablement::Disabled,
logging::Output::Stderr,
)
.unwrap();
let args = Args::parse();
match args {
Args::Basebackup(args) => cmd::basebackup::main(args),
Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
Args::TriggerInitialSizeCalculation(args) => {
cmd::trigger_initial_size_calculation::main(args)
}
}
.unwrap()
}

View File

@@ -0,0 +1,34 @@
use std::sync::Arc;
use pageserver_client::mgmt_api;
use tracing::info;
use utils::id::TenantTimelineId;
pub(crate) struct Spec {
pub(crate) limit_to_first_n_targets: Option<usize>,
pub(crate) targets: Option<Vec<TenantTimelineId>>,
}
pub(crate) async fn discover(
api_client: &Arc<mgmt_api::Client>,
spec: Spec,
) -> anyhow::Result<Vec<TenantTimelineId>> {
let mut timelines = if let Some(targets) = spec.targets {
targets
} else {
mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
};
if let Some(limit) = spec.limit_to_first_n_targets {
timelines.sort(); // for determinism
timelines.truncate(limit);
if timelines.len() < limit {
anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
}
}
info!("timelines:\n{:?}", timelines);
info!("number of timelines:\n{:?}", timelines.len());
Ok(timelines)
}

View File

@@ -0,0 +1,8 @@
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
let colon_and_jwt = if let Some(jwt) = jwt {
format!(":{jwt}") // TODO: urlescape
} else {
String::new()
};
format!("postgres://postgres{colon_and_jwt}@{host_port}")
}

View File

@@ -0,0 +1,88 @@
use std::time::Duration;
use anyhow::Context;
pub(crate) struct Stats {
latency_histo: hdrhistogram::Histogram<u64>,
}
impl Stats {
pub(crate) fn new() -> Self {
Self {
// Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
// which would skew the benchmark results.
latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
}
}
pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
let micros: u64 = latency
.as_micros()
.try_into()
.context("latency greater than u64")?;
self.latency_histo
.record(micros)
.context("add to histogram")?;
Ok(())
}
pub(crate) fn output(&self) -> Output {
let latency_percentiles = std::array::from_fn(|idx| {
let micros = self
.latency_histo
.value_at_percentile(LATENCY_PERCENTILES[idx]);
Duration::from_micros(micros)
});
Output {
request_count: self.latency_histo.len(),
latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
latency_percentiles: LatencyPercentiles {
latency_percentiles,
},
}
}
pub(crate) fn add(&mut self, other: &Self) {
let Self {
ref mut latency_histo,
} = self;
latency_histo.add(&other.latency_histo).unwrap();
}
}
impl Default for Stats {
fn default() -> Self {
Self::new()
}
}
const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
struct LatencyPercentiles {
latency_percentiles: [Duration; 4],
}
impl serde::Serialize for LatencyPercentiles {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
use serde::ser::SerializeMap;
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
for p in LATENCY_PERCENTILES {
ser.serialize_entry(
&format!("p{p}"),
&format!(
"{}",
&humantime::format_duration(self.latency_percentiles[0])
),
)?;
}
ser.end()
}
}
#[derive(serde::Serialize)]
pub(crate) struct Output {
request_count: u64,
#[serde(with = "humantime_serde")]
latency_mean: Duration,
latency_percentiles: LatencyPercentiles,
}

View File

@@ -0,0 +1,45 @@
pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
macro_rules! declare {
($THREAD_LOCAL_NAME:ident: $T:ty) => {
thread_local! {
pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
);
}
};
}
use std::sync::{Arc, Mutex};
pub(crate) use declare;
macro_rules! main {
($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
let main_impl = $main_impl;
let all = Arc::new(Mutex::new(Vec::new()));
let rt = tokio::runtime::Builder::new_multi_thread()
.on_thread_start({
let all = Arc::clone(&all);
move || {
// pre-initialize the thread local stats by accessesing them
// (some stats like requests_stats::Stats are quite costly to initialize,
// we don't want to pay that cost during the measurement period)
$THREAD_LOCAL_NAME.with(|stats| {
let stats: Arc<_> = Arc::clone(&*stats.borrow());
all.lock().unwrap().push(stats);
});
}
})
.enable_all()
.build()
.unwrap();
let main_task = rt.spawn(main_impl(all));
rt.block_on(main_task).unwrap()
}};
}
pub(crate) use main;

View File

@@ -23,6 +23,7 @@ use tracing::*;
use tokio_tar::{Builder, EntryType, Header};
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -174,7 +175,7 @@ where
] {
for segno in self
.timeline
.list_slru_segments(kind, self.lsn, self.ctx)
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
.await?
{
self.add_slru_segment(kind, segno).await?;
@@ -192,7 +193,7 @@ where
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
@@ -267,7 +268,7 @@ where
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, self.lsn, false, self.ctx)
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
.await?;
// If the relation is empty, create an empty file
@@ -288,7 +289,7 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
@@ -310,7 +311,7 @@ where
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -352,7 +353,7 @@ where
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
ensure!(
@@ -399,7 +400,7 @@ where
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?
.is_empty()
{

View File

@@ -31,6 +31,7 @@ use pageserver::{
virtual_file,
};
use postgres_backend::AuthType;
use utils::failpoint_support;
use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals;
use utils::{
@@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> {
}
// Initialize up failpoints support
let scenario = pageserver::failpoint_support::init();
let scenario = failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);

View File

@@ -76,6 +76,8 @@ pub mod defaults {
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
///
/// Default built-in configuration file.
///
@@ -88,6 +90,7 @@ pub mod defaults {
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
# initial superuser role name to use when creating a new tenant
@@ -108,6 +111,8 @@ pub mod defaults {
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -233,6 +238,9 @@ pub struct PageServerConf {
/// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
/// Maximum number of WAL records to be ingested and committed at the same time
pub ingest_batch_size: u64,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -314,6 +322,8 @@ struct PageServerConfigBuilder {
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
ingest_batch_size: BuilderValue<u64>,
}
impl Default for PageServerConfigBuilder {
@@ -386,6 +396,8 @@ impl Default for PageServerConfigBuilder {
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
}
}
}
@@ -534,6 +546,10 @@ impl PageServerConfigBuilder {
self.heatmap_upload_concurrency = BuilderValue::Set(value)
}
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_warmup = self
.concurrent_tenant_warmup
@@ -632,10 +648,12 @@ impl PageServerConfigBuilder {
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
})
}
}
@@ -878,6 +896,7 @@ impl PageServerConf {
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -949,6 +968,7 @@ impl PageServerConf {
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
}
}
}
@@ -1177,7 +1197,8 @@ background_task_maximum_delay = '334 s'
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1238,7 +1259,8 @@ background_task_maximum_delay = '334 s'
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
ingest_batch_size: 100,
},
"Should be able to parse all basic config values correctly"
);
@@ -1468,6 +1490,7 @@ threshold = "20m"
period: Duration::from_secs(10),
#[cfg(feature = "testing")]
mock_statvfs: None,
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
})
);
match &conf.default_tenant_conf.eviction_policy {

View File

@@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig {
pub period: Duration,
#[cfg(feature = "testing")]
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
/// Select sorting for evicted layers
#[serde(default)]
pub eviction_order: EvictionOrder,
}
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
/// partitioning.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
/// Order the layers to be evicted by how recently they have been accessed in absolute
/// time.
///
/// This strategy is unfair when some tenants grow faster than others towards the slower
/// growing.
#[default]
AbsoluteAccessed,
/// Order the layers to be evicted by how recently they have been accessed relatively within
/// the set of resident layers of a tenant.
///
/// This strategy will evict layers more fairly but is untested.
RelativeAccessed {
#[serde(default)]
highest_layer_count_loses_first: bool,
},
}
impl EvictionOrder {
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
/// counts should be the first ones to have their layers evicted.
fn highest_layer_count_loses_first(&self) -> bool {
match self {
EvictionOrder::AbsoluteAccessed => false,
EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first,
} => *highest_layer_count_loses_first,
}
}
}
#[derive(Default)]
@@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration(
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
let res = disk_usage_eviction_task_iteration_impl(
state,
storage,
usage_pre,
task_config.eviction_order,
cancel,
)
.await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let candidates = match collect_eviction_candidates(cancel).await? {
let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
@@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Debug-log the list of candidates
let now = SystemTime::now();
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let nth = i + 1;
let desc = candidate.layer.layer_desc();
let total_candidates = candidates.len();
let size = desc.file_size;
let rel = candidate.relative_last_activity;
debug!(
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
i + 1,
candidates.len(),
desc.file_size,
"cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
now.duration_since(candidate.last_activity_ts)
.unwrap()
.as_micros(),
partition,
desc.tenant_shard_id,
desc.timeline_id,
candidate.layer,
@@ -459,6 +506,7 @@ struct EvictionCandidate {
timeline: Arc<Timeline>,
layer: Layer,
last_activity_ts: SystemTime,
relative_last_activity: finite_f32::FiniteF32,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -478,24 +526,24 @@ enum EvictionCandidates {
/// order. A caller that evicts in that order, until pressure is relieved, implements
/// the eviction policy outlined in the module comment.
///
/// # Example
/// # Example with EvictionOrder::AbsoluteAccessed
///
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
/// Each layer has size 100, and both tenant's min_resident_size is 150.
/// The eviction order would be
///
/// ```text
/// partition last_activity_ts tenant/layer
/// Above 18:30 A/c
/// Above 19:00 A/b
/// Above 18:29 B/c
/// Above 19:05 B/b
/// Above 20:00 B/a
/// Above 20:03 A/a
/// Below 20:30 A/d
/// Below 20:40 B/d
/// Below 20:45 B/e
/// Below 20:58 A/e
/// partition last_activity_ts tenant/layer
/// Above 18:30 A/c
/// Above 19:00 A/b
/// Above 18:29 B/c
/// Above 19:05 B/b
/// Above 20:00 B/a
/// Above 20:03 A/a
/// Below 20:30 A/d
/// Below 20:40 B/d
/// Below 20:45 B/e
/// Below 20:58 A/e
/// ```
///
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -505,7 +553,77 @@ enum EvictionCandidates {
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
/// after exhauting the `Above` partition.
/// So, we did not respect each tenant's min_resident_size.
///
/// # Example with EvictionOrder::RelativeAccessed
///
/// ```text
/// partition relative_age last_activity_ts tenant/layer
/// Above 0/4 18:30 A/c
/// Above 0/4 18:29 B/c
/// Above 1/4 19:00 A/b
/// Above 1/4 19:05 B/b
/// Above 2/4 20:00 B/a
/// Above 2/4 20:03 A/a
/// Below 3/4 20:30 A/d
/// Below 3/4 20:40 B/d
/// Below 4/4 20:45 B/e
/// Below 4/4 20:58 A/e
/// ```
///
/// With tenants having the same number of layers the picture does not change much. The same with
/// A having many more layers **resident** (not all of them listed):
///
/// ```text
/// Above 0/100 18:30 A/c
/// Above 0/4 18:29 B/c
/// Above 1/100 19:00 A/b
/// Above 2/100 20:03 A/a
/// Above 3/100 20:03 A/nth_3
/// Above 4/100 20:03 A/nth_4
/// ...
/// Above 1/4 19:05 B/b
/// Above 25/100 20:04 A/nth_25
/// ...
/// Above 2/4 20:00 B/a
/// Above 50/100 20:10 A/nth_50
/// ...
/// Below 3/4 20:40 B/d
/// Below 99/100 20:30 A/nth_99
/// Below 4/4 20:45 B/e
/// Below 100/100 20:58 A/nth_100
/// ```
///
/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
/// difficult to see is what happens on the next round assuming the evicting 23 from the above list
/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
/// appeared:
///
/// ```text
/// Above 0/87 20:04 A/nth_23
/// Above 0/3 19:05 B/b
/// Above 0/50 20:59 C/nth_0
/// Above 1/87 20:04 A/nth_24
/// Above 1/50 21:00 C/nth_1
/// Above 2/87 20:04 A/nth_25
/// ...
/// Above 16/50 21:02 C/nth_16
/// Above 1/3 20:00 B/a
/// Above 27/87 20:10 A/nth_50
/// ...
/// Below 2/3 20:40 B/d
/// Below 49/50 21:05 C/nth_49
/// Below 86/87 20:30 A/nth_99
/// Below 3/3 20:45 B/e
/// Below 50/50 21:05 C/nth_50
/// Below 87/87 20:58 A/nth_100
/// ```
///
/// Now relieving pressure with 23 layers would cost:
/// - tenant A 14 layers
/// - tenant B 1 layer
/// - tenant C 8 layers
async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
// get a snapshot of the list of tenants
@@ -591,12 +709,63 @@ async fn collect_eviction_candidates(
tenant_candidates
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
for (timeline, layer_info) in tenant_candidates.into_iter() {
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if eviction_order.highest_layer_count_loses_first() {
// relative_age vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic this could hit the same tenants
// disproportionetly on multiple invocations. alternative could be to remember how many
// layers did we evict last time from this tenant, and inject that as an additional
// fudge here.
1
};
let total = tenant_candidates
.len()
.checked_sub(fudge)
.filter(|&x| x > 0)
// support 0 or 1 resident layer tenants as well
.unwrap_or(1);
let divider = total as f32;
for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
let file_size = layer_info.file_size();
// as we iterate this reverse sorted list, the most recently accessed layer will always
// be 1.0; this is for us to evict it last.
let relative_last_activity = if matches!(
eviction_order,
EvictionOrder::RelativeAccessed { .. }
) {
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
// similarly for u16. unsure how it would help.
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
} else {
finite_f32::FiniteF32::ZERO
};
let candidate = EvictionCandidate {
timeline,
last_activity_ts: layer_info.last_activity_ts,
layer: layer_info.layer,
relative_last_activity,
};
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
@@ -610,8 +779,19 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
match eviction_order {
EvictionOrder::AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
EvictionOrder::RelativeAccessed { .. } => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
}
}
Ok(EvictionCandidates::Finished(candidates))
}
@@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey {
}
}
/// A totally ordered f32 subset we can use with sorting functions.
mod finite_f32 {
/// A totally ordered f32 subset we can use with sorting functions.
#[derive(Clone, Copy, PartialEq)]
pub struct FiniteF32(f32);
impl std::fmt::Debug for FiniteF32 {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Debug::fmt(&self.0, f)
}
}
impl std::fmt::Display for FiniteF32 {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
std::fmt::Display::fmt(&self.0, f)
}
}
impl std::cmp::Eq for FiniteF32 {}
impl std::cmp::PartialOrd for FiniteF32 {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl std::cmp::Ord for FiniteF32 {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.0.total_cmp(&other.0)
}
}
impl TryFrom<f32> for FiniteF32 {
type Error = f32;
fn try_from(value: f32) -> Result<Self, Self::Error> {
if value.is_finite() {
Ok(FiniteF32(value))
} else {
Err(value)
}
}
}
impl FiniteF32 {
pub const ZERO: FiniteF32 = FiniteF32(0.0);
pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
if (0.0..=1.0).contains(&value) {
// -0.0 is within the range, make sure it is assumed 0.0..=1.0
let value = value.abs();
Ok(FiniteF32(value))
} else {
Err(value)
}
}
}
}
mod filesystem_level_usage {
use anyhow::Context;
use camino::Utf8Path;
@@ -721,6 +961,7 @@ mod filesystem_level_usage {
#[test]
fn max_usage_pct_pressure() {
use super::EvictionOrder;
use super::Usage as _;
use std::time::Duration;
use utils::serde_percent::Percent;
@@ -732,6 +973,7 @@ mod filesystem_level_usage {
period: Duration::MAX,
#[cfg(feature = "testing")]
mock_statvfs: None,
eviction_order: EvictionOrder::default(),
},
total_bytes: 100_000,
avail_bytes: 0,

View File

@@ -159,6 +159,12 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"412":
description: Deletion may not proceed, tenant is not in Active state
content:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:

View File

@@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::request_span;
use utils::http::json::json_request_or_empty_body;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -66,9 +67,6 @@ use utils::{
lsn::Lsn,
};
// Imports only used for testing APIs
use pageserver_api::models::ConfigureFailpointsRequest;
// For APIs that require an Active tenant, how long should we block waiting for that state?
// This is not functionally necessary (clients will retry), but avoids generating a lot of
// failed API calls while tenants are activating.
@@ -308,6 +306,7 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
SlotUpsertError(e) => e.into(),
Other(o) => ApiError::InternalServerError(o),
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
Cancelled => ApiError::ShuttingDown,
}
}
}
@@ -888,7 +887,9 @@ async fn tenant_delete_handler(
let state = get_state(&request);
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
state
.tenant_manager
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
.instrument(info_span!("tenant_delete_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard = %tenant_shard_id.shard_slug()
@@ -1292,34 +1293,6 @@ async fn handle_tenant_break(
json_response(StatusCode::OK, ())
}
async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
"Cannot manage failpoints because pageserver was compiled without failpoints support"
)));
}
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
for fp in failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow!(
"Failed to configure failpoints: {err_msg}"
)));
}
}
json_response(StatusCode::OK, ())
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
@@ -1568,19 +1541,22 @@ async fn disk_usage_eviction_run(
struct Config {
/// How many bytes to evict before reporting that pressure is relieved.
evict_bytes: u64,
#[serde(default)]
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
}
#[derive(Debug, Clone, Copy, serde::Serialize)]
struct Usage {
// remains unchanged after instantiation of the struct
config: Config,
evict_bytes: u64,
// updated by `add_available_bytes`
freed_bytes: u64,
}
impl crate::disk_usage_eviction_task::Usage for Usage {
fn has_pressure(&self) -> bool {
self.config.evict_bytes > self.freed_bytes
self.evict_bytes > self.freed_bytes
}
fn add_available_bytes(&mut self, bytes: u64) {
@@ -1591,7 +1567,7 @@ async fn disk_usage_eviction_run(
let config = json_request::<Config>(&mut r).await?;
let usage = Usage {
config,
evict_bytes: config.evict_bytes,
freed_bytes: 0,
};
@@ -1606,7 +1582,11 @@ async fn disk_usage_eviction_run(
let state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state, storage, usage, &cancel,
&state,
storage,
usage,
config.eviction_order,
&cancel,
)
.await;

View File

@@ -21,6 +21,7 @@ use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
@@ -312,13 +313,16 @@ async fn import_wal(
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
let mut modification = tline.begin_modification(endpoint);
let mut modification = tline.begin_modification(last_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
WAL_INGEST.records_committed.inc();
modification.commit(ctx).await?;
last_lsn = lsn;
nrecords += 1;
@@ -448,13 +452,14 @@ pub async fn import_wal_from_tar(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(end_lsn);
let mut modification = tline.begin_modification(last_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
modification.commit(ctx).await?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);

View File

@@ -25,8 +25,6 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
pub mod failpoint_support;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;

View File

@@ -53,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::rel_block_to_key;
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -747,7 +747,7 @@ impl PageServerHandler {
.await?;
let exists = timeline
.get_rel_exists(req.rel, lsn, req.latest, ctx)
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -766,7 +766,9 @@ impl PageServerHandler {
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
let n_blocks = timeline
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
@@ -785,7 +787,13 @@ impl PageServerHandler {
.await?;
let total_blocks = timeline
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::Lsn(lsn),
req.latest,
ctx,
)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -816,7 +824,7 @@ impl PageServerHandler {
let key = rel_block_to_key(req.rel, req.blkno);
let page = if timeline.get_shard_identity().is_key_local(&key) {
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?
} else {
// The Tenant shard we looked up at connection start does not hold this particular
@@ -853,7 +861,7 @@ impl PageServerHandler {
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?
};

View File

@@ -11,7 +11,7 @@ use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,6 +147,7 @@ impl Timeline {
{
DatadirModification {
tline: self,
pending_lsns: Vec::new(),
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
@@ -163,7 +164,7 @@ impl Timeline {
&self,
tag: RelTag,
blknum: BlockNumber,
lsn: Lsn,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
@@ -173,17 +174,20 @@ impl Timeline {
));
}
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag, blknum, lsn, nblocks
tag,
blknum,
version.get_lsn(),
nblocks
);
return Ok(ZERO_PAGE.clone());
}
let key = rel_block_to_key(tag, blknum);
self.get(key, lsn, ctx).await
version.get(self, key, ctx).await
}
// Get size of a database in blocks
@@ -191,16 +195,16 @@ impl Timeline {
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<usize, PageReconstructError> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
for rel in rels {
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
@@ -210,7 +214,7 @@ impl Timeline {
pub async fn get_rel_size(
&self,
tag: RelTag,
lsn: Lsn,
version: Version<'_>,
latest: bool,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
@@ -220,12 +224,12 @@ impl Timeline {
));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(nblocks);
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
&& !self.get_rel_exists(tag, version, latest, ctx).await?
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
@@ -235,7 +239,7 @@ impl Timeline {
}
let key = rel_size_to_key(tag);
let mut buf = self.get(key, lsn, ctx).await?;
let mut buf = version.get(self, key, ctx).await?;
let nblocks = buf.get_u32_le();
if latest {
@@ -246,7 +250,7 @@ impl Timeline {
// latest=true, then it can not cause cache corruption, because with latest=true
// pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
// associated with most recent value of LSN.
self.update_cached_rel_size(tag, lsn, nblocks);
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
}
Ok(nblocks)
}
@@ -255,7 +259,7 @@ impl Timeline {
pub async fn get_rel_exists(
&self,
tag: RelTag,
lsn: Lsn,
version: Version<'_>,
_latest: bool,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
@@ -266,12 +270,12 @@ impl Timeline {
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.get(key, lsn, ctx).await?;
let buf = version.get(self, key, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -291,12 +295,12 @@ impl Timeline {
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
version: Version<'_>,
ctx: &RequestContext,
) -> Result<HashSet<RelTag>, PageReconstructError> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = self.get(key, lsn, ctx).await?;
let buf = version.get(self, key, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -332,11 +336,11 @@ impl Timeline {
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
version: Version<'_>,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = self.get(key, lsn, ctx).await?;
let mut buf = version.get(self, key, ctx).await?;
Ok(buf.get_u32_le())
}
@@ -345,12 +349,12 @@ impl Timeline {
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
version: Version<'_>,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn, ctx).await?;
let buf = version.get(self, key, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -501,11 +505,11 @@ impl Timeline {
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
) -> Result<T, PageReconstructError> {
for segno in self
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
.await?
{
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.await?;
for blknum in (0..nblocks).rev() {
let clog_page = self
@@ -531,13 +535,13 @@ impl Timeline {
pub async fn list_slru_segments(
&self,
kind: SlruKind,
lsn: Lsn,
version: Version<'_>,
ctx: &RequestContext,
) -> Result<HashSet<u32>, PageReconstructError> {
// fetch directory entry
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn, ctx).await?;
let buf = version.get(self, key, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.segments),
Err(e) => Err(PageReconstructError::from(e)),
@@ -548,12 +552,12 @@ impl Timeline {
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
version: Version<'_>,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
let key = relmap_file_key(spcnode, dbnode);
let buf = self.get(key, lsn, ctx).await?;
let buf = version.get(self, key, ctx).await?;
Ok(buf)
}
@@ -652,7 +656,10 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
for rel in self
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
.await?
{
if self.cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
@@ -692,7 +699,7 @@ impl Timeline {
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, lsn, ctx)
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
.await?
.into_iter()
.collect();
@@ -799,18 +806,39 @@ pub struct DatadirModification<'a> {
/// in the state in 'tline' yet.
pub tline: &'a Timeline,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
/// Current LSN of the modification
lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_updates: HashMap<Key, Value>,
pending_deletions: Vec<Range<Key>>,
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_nblocks: i64,
}
impl<'a> DatadirModification<'a> {
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
lsn >= self.lsn,
"setting an older lsn {} than {} is not allowed",
lsn,
self.lsn
);
if lsn > self.lsn {
self.pending_lsns.push(self.lsn);
self.lsn = lsn;
}
Ok(())
}
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> {
dbnode: Oid,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let req_lsn = self.tline.get_last_record_lsn();
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
.await?;
// Remove entry from dbdir
@@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> {
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
if self
.tline
.get_rel_exists(rel, Version::Modified(self), true, ctx)
.await?
{
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> {
let writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
} else {
retained_pending_updates.insert(key, value);
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;
} else {
retained_pending_updates
.entry(key)
.or_default()
.push((lsn, value));
}
}
}
self.pending_updates.extend(retained_pending_updates);
self.pending_updates = retained_pending_updates;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> {
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value, ctx).await?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;
if !self.pending_updates.is_empty() {
writer.put_batch(&self.pending_updates, ctx).await?;
self.pending_updates.clear();
}
writer.finish_write(lsn);
if !self.pending_deletions.is_empty() {
writer.delete_batch(&self.pending_deletions).await?;
self.pending_deletions.clear();
}
self.pending_lsns.push(self.lsn);
for pending_lsn in self.pending_lsns.drain(..) {
// Ideally, we should be able to call writer.finish_write() only once
// with the highest LSN. However, the last_record_lsn variable in the
// timeline keeps track of the latest LSN and the immediate previous LSN
// so we need to record every LSN to not leave a gap between them.
writer.finish_write(pending_lsn);
}
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub(crate) fn is_empty(&self) -> bool {
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
pub(crate) fn len(&self) -> usize {
self.pending_updates.len() + self.pending_deletions.len()
}
// Internal helper functions to batch the modifications
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
// Have we already updated the same key? Read the pending updated
// Have we already updated the same key? Read the latest pending updated
// version in that case.
//
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(value) = self.pending_updates.get(&key) {
if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
};
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
fn put(&mut self, key: Key, val: Value) {
self.pending_updates.insert(key, val);
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
*last_value = val;
return;
}
}
values.push((self.lsn, val));
}
fn delete(&mut self, key_range: Range<Key>) {
trace!("DELETE {}-{}", key_range.start, key_range.end);
self.pending_deletions.push(key_range);
self.pending_deletions.push((key_range, self.lsn));
}
}
/// This struct facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
/// need to look up the keys in the modification first before looking them up in the
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
Lsn(Lsn),
Modified(&'a DatadirModification<'a>),
}
impl<'a> Version<'a> {
async fn get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}
fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
}
}
@@ -1776,6 +1863,7 @@ pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
@@ -1790,7 +1878,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}

View File

@@ -33,6 +33,7 @@ use tracing::*;
use utils::backoff;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::failpoint_support;
use utils::fs_ext;
use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
@@ -890,7 +891,7 @@ impl Tenant {
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
failpoint_support::sleep_millis_async!("before-attaching-tenant");
let preload = match preload {
Some(p) => p,
@@ -1002,7 +1003,7 @@ impl Tenant {
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
failpoint_support::sleep_millis_async!("attach-before-activate");
info!("Done");
@@ -2843,9 +2844,7 @@ impl Tenant {
}
};
crate::failpoint_support::sleep_millis_async!(
"gc_iteration_internal_after_getting_gc_timelines"
);
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
// If there is nothing to GC, we don't want any messages in the INFO log.
if !gc_timelines.is_empty() {
@@ -3138,6 +3137,7 @@ impl Tenant {
/// For unit tests, make this visible so that other modules can directly create timelines
#[cfg(test)]
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn bootstrap_timeline_test(
&self,
timeline_id: TimelineId,

View File

@@ -46,6 +46,8 @@ pub mod defaults {
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]

View File

@@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError {
#[error("Timeline {0}")]
Timeline(#[from] DeleteTimelineError),
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}

View File

@@ -514,10 +514,7 @@ pub async fn init_tenant_mgr(
&ctx,
) {
Ok(tenant) => {
tenants.insert(
TenantShardId::unsharded(tenant.tenant_id()),
TenantSlot::Attached(tenant),
);
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
@@ -962,35 +959,27 @@ impl TenantManager {
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
// Directory structure is the same for attached and secondary modes:
// create it if it doesn't exist. Timeline load/creation expects the
// timelines/ subdir to already exist.
//
// Does not need to be fsync'd because local storage is just a cache.
tokio::fs::create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
// Before activating either secondary or attached mode, persist the
// configuration, so that on restart we will re-attach (or re-start
// secondary) on the tenant.
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => {
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
tokio::fs::create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
}
LocationMode::Secondary(_) => TenantSlot::Secondary,
LocationMode::Attached(_attach_config) => {
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
tokio::fs::create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let shard_identity = new_location_config.shard;
let tenant = tenant_spawn(
self.conf,
@@ -1102,6 +1091,71 @@ impl TenantManager {
.collect(),
}
}
pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
activation_timeout: Duration,
) -> Result<(), DeleteTenantError> {
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
//
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
// Currently, deletion requires a reference to the tenants map in order to
// keep the Tenant in the map until deletion is complete, and then remove
// it at the end.
//
// See https://github.com/neondatabase/neon/issues/5080
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
TenantSlot::Attached(tenant) => tenant.clone(),
_ => {
// Express "not attached" as equivalent to "not found"
return Err(DeleteTenantError::NotAttached);
}
};
match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If a tenant is broken or stopping, DeleteTenantFlow can
// handle it: broken tenants proceed to delete, stopping tenants
// are checked for deletion already in progress.
}
_ => {
tenant
.wait_to_become_active(activation_timeout)
.await
.map_err(|e| match e {
GetActiveTenantError::WillNotBecomeActive(_) => {
DeleteTenantError::InvalidState(tenant.current_state())
}
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
GetActiveTenantError::WaitForActiveTimeout {
latest_state: _latest_state,
wait_time: _wait_time,
} => DeleteTenantError::InvalidState(tenant.current_state()),
})?;
}
}
let result = DeleteTenantFlow::run(
self.conf,
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
)
.await;
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
result
}
}
#[derive(Debug, thiserror::Error)]
@@ -1279,41 +1333,6 @@ pub(crate) async fn get_active_tenant_with_timeout(
Ok(tenant)
}
pub(crate) async fn delete_tenant(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
//
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
// Currently, deletion requires a reference to the tenants map in order to
// keep the Tenant in the map until deletion is complete, and then remove
// it at the end.
//
// See https://github.com/neondatabase/neon/issues/5080
// TODO(sharding): make delete API sharding-aware
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
TenantSlot::Attached(tenant) => tenant.clone(),
_ => {
// Express "not attached" as equivalent to "not found"
return Err(DeleteTenantError::NotAttached);
}
};
let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
result
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum DeleteTimelineError {
#[error("Tenant {0}")]

View File

@@ -2192,15 +2192,6 @@ mod tests {
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
timeline_path
.strip_prefix(&test_state.harness.conf.workdir)
.unwrap(),
);
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(
&test_state.harness.tenant_shard_id,
@@ -2209,6 +2200,10 @@ mod tests {
)
.get_path(),
);
std::fs::create_dir_all(index_path.parent().unwrap())
.expect("creating test dir should work");
eprintln!("Writing {index_path}");
std::fs::write(&index_path, index_part_bytes).unwrap();
example_index_part

View File

@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::RwLock;
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{DeltaLayerWriter, ResidentLayer};
@@ -246,16 +246,43 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
pub(crate) async fn put_value(
&self,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let inner: &mut _ = &mut *self.inner.write().await;
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
}
pub(crate) async fn put_values(
&self,
values: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
for (key, vals) in values {
for (lsn, val) in vals {
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
.await?;
}
}
Ok(())
}
async fn put_value_locked(
&self,
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let off = {
// Avoid doing allocations for "small" values.
@@ -264,7 +291,7 @@ impl InMemoryLayer {
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
inner
locked_inner
.file
.write_blob(
&buf,
@@ -275,7 +302,7 @@ impl InMemoryLayer {
.await?
};
let vec_map = inner.index.entry(key).or_default();
let vec_map = locked_inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
@@ -285,13 +312,11 @@ impl InMemoryLayer {
Ok(())
}
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())
}
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
pub async fn freeze(&self, end_lsn: Lsn) {

View File

@@ -1459,6 +1459,7 @@ impl Timeline {
max_lsn_wal_lag,
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
availability_zone: self.conf.availability_zone.clone(),
ingest_batch_size: self.conf.ingest_batch_size,
},
broker_client,
ctx,
@@ -2471,9 +2472,27 @@ impl Timeline {
Ok(())
}
async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
let layer = self.get_layer_for_write(lsn).await?;
layer.put_tombstone(key_range, lsn).await?;
async fn put_values(
&self,
values: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Pick the first LSN in the batch to get the layer to write to.
for lsns in values.values() {
if let Some((lsn, _)) = lsns.first() {
let layer = self.get_layer_for_write(*lsn).await?;
layer.put_values(values, ctx).await?;
break;
}
}
Ok(())
}
async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
if let Some((_, lsn)) = tombstones.first() {
let layer = self.get_layer_for_write(*lsn).await?;
layer.put_tombstones(tombstones).await?;
}
Ok(())
}
@@ -4529,8 +4548,16 @@ impl<'a> TimelineWriter<'a> {
self.tl.put_value(key, lsn, value, ctx).await
}
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
self.tl.put_tombstone(key_range, lsn).await
pub(crate) async fn put_batch(
&self,
batch: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.tl.put_values(batch, ctx).await
}
pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
self.tl.put_tombstones(batch).await
}
/// Track the end of the latest digested WAL record.
@@ -4541,11 +4568,11 @@ impl<'a> TimelineWriter<'a> {
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
/// the latest and can be read.
pub fn finish_write(&self, new_lsn: Lsn) {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
self.tl.finish_write(new_lsn);
}
pub fn update_current_logical_size(&self, delta: i64) {
pub(crate) fn update_current_logical_size(&self, delta: i64) {
self.tl.update_current_logical_size(delta)
}
}

View File

@@ -58,6 +58,7 @@ pub struct WalReceiverConf {
pub max_lsn_wal_lag: NonZeroU64,
pub auth_token: Option<Arc<String>>,
pub availability_zone: Option<String>,
pub ingest_batch_size: u64,
}
pub struct WalReceiver {

View File

@@ -411,6 +411,7 @@ impl ConnectionManagerState {
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let ingest_batch_size = self.conf.ingest_batch_size;
let timeline = Arc::clone(&self.timeline);
let ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionHandler,
@@ -430,6 +431,7 @@ impl ConnectionManagerState {
connect_timeout,
ctx,
node_id,
ingest_batch_size,
)
.await;
@@ -1345,6 +1347,7 @@ mod tests {
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
auth_token: None,
availability_zone: None,
ingest_batch_size: 1,
},
wal_connection: None,
wal_stream_candidates: HashMap::new(),

View File

@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
@@ -106,6 +106,7 @@ impl From<WalDecodeError> for WalReceiverError {
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
#[allow(clippy::too_many_arguments)]
pub(super) async fn handle_walreceiver_connection(
timeline: Arc<Timeline>,
wal_source_connconf: PgConnectionConfig,
@@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection(
connect_timeout: Duration,
ctx: RequestContext,
node: NodeId,
ingest_batch_size: u64,
) -> Result<(), WalReceiverError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection(
{
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(endlsn);
let mut modification = timeline.begin_modification(startlsn);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
@@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection(
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
walingest
// Ingest the records without immediately committing them.
let ingested = walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);
modification.commit(&ctx).await?;
uncommitted_records = 0;
filtered_records = 0;
}
}
// Commit the remaining records.
if uncommitted_records > 0 {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);
modification.commit(&ctx).await?;
}
}

View File

@@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use utils::failpoint_support;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
@@ -47,20 +48,18 @@ use postgres_ffi::TransactionId;
use postgres_ffi::BLCKSZ;
use utils::lsn::Lsn;
pub struct WalIngest<'a> {
pub struct WalIngest {
shard: ShardIdentity,
timeline: &'a Timeline,
checkpoint: CheckPoint,
checkpoint_modified: bool,
}
impl<'a> WalIngest<'a> {
impl WalIngest {
pub async fn new(
timeline: &'a Timeline,
timeline: &Timeline,
startpoint: Lsn,
ctx: &'_ RequestContext,
) -> anyhow::Result<WalIngest<'a>> {
ctx: &RequestContext,
) -> anyhow::Result<WalIngest> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -69,7 +68,6 @@ impl<'a> WalIngest<'a> {
Ok(WalIngest {
shard: *timeline.get_shard_identity(),
timeline,
checkpoint,
checkpoint_modified: false,
})
@@ -83,6 +81,8 @@ impl<'a> WalIngest<'a> {
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
///
/// This function returns `true` if the record was ingested, and `false` if it was filtered out
///
pub async fn ingest_record(
&mut self,
recdata: Bytes,
@@ -90,11 +90,13 @@ impl<'a> WalIngest<'a> {
modification: &mut DatadirModification<'_>,
decoded: &mut DecodedWALRecord,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> anyhow::Result<bool> {
WAL_INGEST.records_received.inc();
let pg_version = modification.tline.pg_version;
let prev_len = modification.len();
modification.lsn = lsn;
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
modification.set_lsn(lsn)?;
decode_wal_record(recdata, decoded, pg_version)?;
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
@@ -131,9 +133,9 @@ impl<'a> WalIngest<'a> {
}
pg_constants::RM_DBASE_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
debug!(%info, %pg_version, "handle RM_DBASE_ID");
if self.timeline.pg_version == 14 {
if pg_version == 14 {
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
debug!("XLOG_DBASE_CREATE v14");
@@ -149,7 +151,7 @@ impl<'a> WalIngest<'a> {
.await?;
}
}
} else if self.timeline.pg_version == 15 {
} else if pg_version == 15 {
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -169,7 +171,7 @@ impl<'a> WalIngest<'a> {
.await?;
}
}
} else if self.timeline.pg_version == 16 {
} else if pg_version == 16 {
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -344,9 +346,7 @@ impl<'a> WalIngest<'a> {
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!(
"wal-ingest-logical-message-sleep"
);
failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
modification.put_file(path, message, ctx).await?;
}
@@ -400,19 +400,11 @@ impl<'a> WalIngest<'a> {
self.checkpoint_modified = false;
}
if modification.is_empty() {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
modification.tline.finish_write(lsn);
} else {
WAL_INGEST.records_committed.inc();
modification.commit(ctx).await?;
}
// Note that at this point this record is only cached in the modification
// until commit() is called to flush the data into the repository and update
// the latest LSN.
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN.
Ok(())
Ok(modification.len() > prev_len)
}
/// Do not store this block, but observe it for the purposes of updating our relation size state.
@@ -459,7 +451,7 @@ impl<'a> WalIngest<'a> {
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
// do not materialize null pages because them most likely be soon replaced with real data
&& blk.bimg_len != 0
{
@@ -512,7 +504,7 @@ impl<'a> WalIngest<'a> {
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
match self.timeline.pg_version {
match modification.tline.pg_version {
14 => {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -736,7 +728,7 @@ impl<'a> WalIngest<'a> {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -817,10 +809,11 @@ impl<'a> WalIngest<'a> {
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
let pg_version = modification.tline.pg_version;
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
match self.timeline.pg_version {
match pg_version {
16 => {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -883,7 +876,7 @@ impl<'a> WalIngest<'a> {
}
_ => bail!(
"Neon RMGR has no known compatibility with PostgreSQL version {}",
self.timeline.pg_version
pg_version
),
}
@@ -906,7 +899,7 @@ impl<'a> WalIngest<'a> {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -984,16 +977,14 @@ impl<'a> WalIngest<'a> {
let src_db_id = rec.src_db_id;
let src_tablespace_id = rec.src_tablespace_id;
// Creating a database is implemented by copying the template (aka. source) database.
// To copy all the relations, we need to ask for the state as of the same LSN, but we
// cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
// the last valid LSN to advance up to it. So we use the previous record's LSN in the
// get calls instead.
let req_lsn = modification.tline.get_last_record_lsn();
let rels = modification
.tline
.list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
.list_rels(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.await?;
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -1001,7 +992,12 @@ impl<'a> WalIngest<'a> {
// Copy relfilemap
let filemap = modification
.tline
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
.get_relmap_file(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.await?;
modification
.put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1015,7 +1011,7 @@ impl<'a> WalIngest<'a> {
let nblocks = modification
.tline
.get_rel_size(src_rel, req_lsn, true, ctx)
.get_rel_size(src_rel, Version::Modified(modification), true, ctx)
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
@@ -1033,7 +1029,13 @@ impl<'a> WalIngest<'a> {
let content = modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
.get_rel_page_at_lsn(
src_rel,
blknum,
Version::Modified(modification),
true,
ctx,
)
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
@@ -1104,7 +1106,7 @@ impl<'a> WalIngest<'a> {
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
fsm_physical_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
let nblocks = get_relsize(modification, rel, ctx).await?;
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1126,7 +1128,7 @@ impl<'a> WalIngest<'a> {
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
vm_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
let nblocks = get_relsize(modification, rel, ctx).await?;
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1199,10 +1201,9 @@ impl<'a> WalIngest<'a> {
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
let last_lsn = self.timeline.get_last_record_lsn();
if modification
.tline
.get_rel_exists(rel, last_lsn, true, ctx)
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
self.put_rel_drop(modification, rel, ctx).await?;
@@ -1256,10 +1257,9 @@ impl<'a> WalIngest<'a> {
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = modification.tline.get_last_record_lsn();
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, req_lsn, ctx)
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1471,20 +1471,6 @@ impl<'a> WalIngest<'a> {
Ok(())
}
async fn get_relsize(
&mut self,
rel: RelTag,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<BlockNumber> {
let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
0
} else {
self.timeline.get_rel_size(rel, lsn, true, ctx).await?
};
Ok(nblocks)
}
async fn handle_rel_extend(
&mut self,
modification: &mut DatadirModification<'_>,
@@ -1496,7 +1482,6 @@ impl<'a> WalIngest<'a> {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = modification.lsn;
// Get current size and put rel creation if rel doesn't exist
//
@@ -1504,11 +1489,14 @@ impl<'a> WalIngest<'a> {
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
let old_nblocks = if let Some(nblocks) = modification
.tline
.get_cached_rel_size(&rel, modification.get_lsn())
{
nblocks
} else if !self
.timeline
.get_rel_exists(rel, last_lsn, true, ctx)
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1518,7 +1506,10 @@ impl<'a> WalIngest<'a> {
.context("Relation Error")?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
modification
.tline
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
if new_nblocks > old_nblocks {
@@ -1571,10 +1562,9 @@ impl<'a> WalIngest<'a> {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = self.timeline.get_last_record_lsn();
let old_nblocks = if !self
.timeline
.get_slru_segment_exists(kind, segno, last_lsn, ctx)
let old_nblocks = if !modification
.tline
.get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1583,8 +1573,9 @@ impl<'a> WalIngest<'a> {
.await?;
0
} else {
self.timeline
.get_slru_segment_size(kind, segno, last_lsn, ctx)
modification
.tline
.get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
.await?
};
@@ -1607,11 +1598,32 @@ impl<'a> WalIngest<'a> {
}
}
async fn get_relsize(
modification: &DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> anyhow::Result<BlockNumber> {
let nblocks = if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
Ok(nblocks)
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::harness::*;
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
use postgres_ffi::RELSEG_SIZE;
@@ -1632,10 +1644,7 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
async fn init_walingest_test<'a>(
tline: &'a Timeline,
ctx: &RequestContext,
) -> Result<WalIngest<'a>> {
async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1680,29 +1689,29 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
3
);
@@ -1710,46 +1719,46 @@ mod tests {
// Check page contents at each LSN
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 2")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
TEST_IMG("foo blk 2 at 5")
);
@@ -1765,19 +1774,19 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
@@ -1785,13 +1794,13 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
3
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
TEST_IMG("foo blk 2 at 5")
);
@@ -1804,7 +1813,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
.await?,
0
);
@@ -1817,19 +1826,19 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
ZERO_PAGE
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
.await?,
TEST_IMG("foo blk 1")
);
@@ -1842,21 +1851,21 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
1501
);
for blk in 2..1500 {
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
ZERO_PAGE
);
}
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
TEST_IMG("foo blk 1500")
);
@@ -1883,13 +1892,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
1
);
@@ -1902,7 +1911,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
.await?,
false
);
@@ -1920,13 +1929,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.await?,
1
);
@@ -1959,24 +1968,24 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.await?,
relsize
);
@@ -1987,7 +1996,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2004,7 +2013,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
1
);
@@ -2014,7 +2023,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2023,7 +2032,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
relsize
);
@@ -2032,7 +2041,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2052,13 +2061,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
relsize
);
@@ -2068,7 +2077,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2101,7 +2110,9 @@ mod tests {
assert_current_logical_size(&tline, Lsn(lsn));
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE + 1
);
@@ -2113,7 +2124,9 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE
);
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2126,7 +2139,9 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
RELSEG_SIZE - 1
);
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2142,7 +2157,9 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
size as BlockNumber
);
@@ -2177,21 +2194,25 @@ mod tests {
let pg_version = 15; // The test data was generated by pg15
let path = "test_data/sk_wal_segment_from_pgbench";
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
let source_initdb_path = format!("{path}/{INITDB_PATH}");
let startpoint = Lsn::from_hex("14AEC08").unwrap();
let endpoint = Lsn::from_hex("1FFFF98").unwrap();
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
std::fs::create_dir_all(initdb_path.parent().unwrap())
.expect("creating test dir should work");
std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works");
// Bootstrap a real timeline. We can't use create_test_timeline because
// it doesn't create a real checkpoint, and Walingest::new tries to parse
// the garbage data.
//
// TODO use the initdb.tar.zst file stored with the test data to avoid
// problems with inconsistent initdb results after pg minor version bumps.
let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
.unwrap()
.load()
.await;
let tline = tenant
.bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
.bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx)
.await
.unwrap();
@@ -2217,7 +2238,7 @@ mod tests {
let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
.await
.unwrap();
let mut modification = tline.begin_modification(endpoint);
let mut modification = tline.begin_modification(startpoint);
let mut decoded = DecodedWALRecord::default();
println!("decoding {} bytes", bytes.len() - xlogoff);
@@ -2231,6 +2252,7 @@ mod tests {
.await
.unwrap();
}
modification.commit(&ctx).await.unwrap();
}
let duration = started_at.elapsed();

View File

@@ -9,6 +9,7 @@ OBJS = \
libpagestore.o \
neon.o \
neon_utils.o \
neon_walreader.o \
pagestore_smgr.o \
relsize_cache.o \
walproposer.o \

View File

@@ -35,7 +35,8 @@
#define PageStoreTrace DEBUG5
#define RECONNECT_INTERVAL_USEC 1000000
#define MIN_RECONNECT_INTERVAL_USEC 100
#define MAX_RECONNECT_INTERVAL_USEC 1000000
bool connected = false;
PGconn *pageserver_conn = NULL;
@@ -133,6 +134,11 @@ pageserver_connect(int elevel)
const char *values[3];
int n;
static TimestampTz last_connect_time = 0;
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
TimestampTz now;
uint64_t us_since_last_connect;
Assert(!connected);
if (CheckConnstringUpdated())
@@ -140,6 +146,22 @@ pageserver_connect(int elevel)
ReloadConnstring();
}
now = GetCurrentTimestamp();
us_since_last_connect = now - last_connect_time;
if (us_since_last_connect < delay_us)
{
pg_usleep(delay_us - us_since_last_connect);
delay_us *= 2;
if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
delay_us = MAX_RECONNECT_INTERVAL_USEC;
last_connect_time = GetCurrentTimestamp();
}
else
{
delay_us = MIN_RECONNECT_INTERVAL_USEC;
last_connect_time = now;
}
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request)
{
HandleMainLoopInterrupts();
n_reconnect_attempts += 1;
pg_usleep(RECONNECT_INTERVAL_USEC);
}
n_reconnect_attempts = 0;
}

View File

@@ -0,0 +1,96 @@
/*
* Interface to set of libpq wrappers walproposer and neon_walreader need.
* Similar to libpqwalreceiver, but it has blocking connection establishment and
* pqexec which don't fit us. Implementation is at walproposer_pg.c.
*/
#ifndef ___LIBPQWALPROPOSER_H__
#define ___LIBPQWALPROPOSER_H__
/* Re-exported and modified ExecStatusType */
typedef enum
{
/* We received a single CopyBoth result */
WP_EXEC_SUCCESS_COPYBOTH,
/*
* Any success result other than a single CopyBoth was received. The
* specifics of the result were already logged, but it may be useful to
* provide an error message indicating which safekeeper messed up.
*
* Do not expect PQerrorMessage to be appropriately set.
*/
WP_EXEC_UNEXPECTED_SUCCESS,
/*
* No result available at this time. Wait until read-ready, then call
* again. Internally, this is returned when PQisBusy indicates that
* PQgetResult would block.
*/
WP_EXEC_NEEDS_INPUT,
/* Catch-all failure. Check PQerrorMessage. */
WP_EXEC_FAILED,
} WalProposerExecStatusType;
/* Possible return values from walprop_async_read */
typedef enum
{
/* The full read was successful. buf now points to the data */
PG_ASYNC_READ_SUCCESS,
/*
* The read is ongoing. Wait until the connection is read-ready, then try
* again.
*/
PG_ASYNC_READ_TRY_AGAIN,
/* Reading failed. Check PQerrorMessage(conn) */
PG_ASYNC_READ_FAIL,
} PGAsyncReadResult;
/* Possible return values from walprop_async_write */
typedef enum
{
/* The write fully completed */
PG_ASYNC_WRITE_SUCCESS,
/*
* The write started, but you'll need to call PQflush some more times to
* finish it off. We just tried, so it's best to wait until the connection
* is read- or write-ready to try again.
*
* If it becomes read-ready, call PQconsumeInput and flush again. If it
* becomes write-ready, just call PQflush.
*/
PG_ASYNC_WRITE_TRY_FLUSH,
/* Writing failed. Check PQerrorMessage(conn) */
PG_ASYNC_WRITE_FAIL,
} PGAsyncWriteResult;
/*
* This header is included by walproposer.h to define walproposer_api; if we're
* building walproposer without pg, ignore libpq part, leaving only interface
* types.
*/
#ifndef WALPROPOSER_LIB
#include "libpq-fe.h"
/*
* Sometimes working directly with underlying PGconn is simpler, export the
* whole thing for simplicity.
*/
typedef struct WalProposerConn
{
PGconn *pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received CopyData message from
* walprop_async_read */
} WalProposerConn;
extern WalProposerConn *libpqwp_connect_start(char *conninfo);
extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
extern void libpqwp_disconnect(WalProposerConn *conn);
#endif /* WALPROPOSER_LIB */
#endif /* ___LIBPQWALPROPOSER_H__ */

742
pgxn/neon/neon_walreader.c Normal file
View File

@@ -0,0 +1,742 @@
/*
* Like WALRead, but when WAL segment doesn't exist locally instead of throwing
* ERROR asynchronously tries to fetch it from the most advanced safekeeper.
*
* We can't use libpqwalreceiver as it blocks during connection establishment
* (and waiting for PQExec result), so use libpqwalproposer instead.
*
* TODO: keepalives are currently never sent, so the other side can close the
* connection prematurely.
*
* TODO: close conn if reading takes too long to prevent stuck connections.
*/
#include "postgres.h"
#include <sys/stat.h>
#include <unistd.h>
#include "access/xlog_internal.h"
#include "access/xlogdefs.h"
#include "access/xlogreader.h"
#include "libpq/pqformat.h"
#include "storage/fd.h"
#include "utils/wait_event.h"
#include "libpq-fe.h"
#include "neon_walreader.h"
#include "walproposer.h"
#define NEON_WALREADER_ERR_MSG_LEN 512
/*
* Can be called where NeonWALReader *state is available in the context, adds log_prefix.
*/
#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
static void NeonWALReaderResetRemote(NeonWALReader *state);
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
static void neon_wal_segment_close(NeonWALReader *state);
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
TimeLineID tli);
/*
* State of connection to donor safekeeper.
*/
typedef enum
{
/* no remote connection */
RS_NONE,
/* doing PQconnectPoll, need readable socket */
RS_CONNECTING_READ,
/* doing PQconnectPoll, need writable socket */
RS_CONNECTING_WRITE,
/* Waiting for START_REPLICATION result */
RS_WAIT_EXEC_RESULT,
/* replication stream established */
RS_ESTABLISHED,
} NeonWALReaderRemoteState;
struct NeonWALReader
{
/*
* LSN before which we assume WAL is not available locally. Exists because
* though first segment after startup always exists, part before
* basebackup LSN is filled with zeros.
*/
XLogRecPtr available_lsn;
WALSegmentContext segcxt;
WALOpenSegment seg;
int wre_errno;
/* Explains failure to read, static for simplicity. */
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
/*
* Saved info about request in progress, used to check validity of
* arguments after resume and remember how far we accomplished it. req_lsn
* is 0 if there is no request in progress.
*/
XLogRecPtr req_lsn;
Size req_len;
Size req_progress;
WalProposer *wp; /* we learn donor through walproposer */
char donor_name[64]; /* saved donor safekeeper name for logging */
/* state of connection to safekeeper */
NeonWALReaderRemoteState rem_state;
WalProposerConn *wp_conn;
/*
* position in wp_conn recvbuf from which we'll copy WAL next time, or
* NULL if there is no unprocessed message
*/
char *wal_ptr;
Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */
/*
* LSN of wal_ptr position according to walsender to cross check against
* read request
*/
XLogRecPtr rem_lsn;
/* prepended to lines logged by neon_walreader, if provided */
char log_prefix[64];
};
/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
{
NeonWALReader *reader;
reader = (NeonWALReader *)
palloc_extended(sizeof(NeonWALReader),
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
if (!reader)
return NULL;
reader->available_lsn = available_lsn;
reader->seg.ws_file = -1;
reader->seg.ws_segno = 0;
reader->seg.ws_tli = 0;
reader->segcxt.ws_segsize = wal_segment_size;
reader->wp = wp;
reader->rem_state = RS_NONE;
if (log_prefix)
strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
return reader;
}
void
NeonWALReaderFree(NeonWALReader *state)
{
if (state->seg.ws_file != -1)
neon_wal_segment_close(state);
if (state->wp_conn)
libpqwp_disconnect(state->wp_conn);
pfree(state);
}
/*
* Like vanilla WALRead, but if requested position is before available_lsn or
* WAL segment doesn't exist on disk, it tries to fetch needed segment from the
* advanced safekeeper.
*
* Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
* fetched from timeline 'tli'.
*
* Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
* occurs, in which case 'err' has the desciption. Error always closes remote
* connection, if there was any, so socket subscription should be removed.
*
* NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
* NeonWALReaderSocket and call NeonWALRead again with exactly the same
* arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
* docs during connection establishment (before first successful read) socket
* underneath might change.
*
* Also, eventually walreader should switch from remote to local read; caller
* should remove subscription to socket then by checking NeonWALReaderEvents
* after successful read (otherwise next read might reopen the connection with
* different socket).
*
* Reading not monotonically is not supported and will result in error.
*
* Caller should be sure that WAL up to requested LSN exists, otherwise
* NEON_WALREAD_WOULDBLOCK might be always returned.
*/
NeonWALReadResult
NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
/*
* If requested data is before known available basebackup lsn or there is
* already active remote state, do remote read.
*/
if (startptr < state->available_lsn || state->rem_state != RS_NONE)
{
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
if (NeonWALReadLocal(state, buf, startptr, count, tli))
{
return NEON_WALREAD_SUCCESS;
}
else if (state->wre_errno == ENOENT)
{
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr));
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
else
{
return NEON_WALREAD_ERROR;
}
}
/* Do the read from remote safekeeper. */
static NeonWALReadResult
NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
if (state->rem_state == RS_NONE)
{
XLogRecPtr donor_lsn;
/* no connection yet; start one */
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
if (donor == NULL)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to establish remote connection to fetch WAL: no donor available");
return NEON_WALREAD_ERROR;
}
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
state->wp_conn = libpqwp_connect_start(donor->conninfo);
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to connect to %s to fetch WAL: immediately failed with %s",
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
/* we'll poll immediately */
state->rem_state = RS_CONNECTING_READ;
}
if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
{
switch (PQconnectPoll(state->wp_conn->pg_conn))
{
case PGRES_POLLING_FAILED:
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to connect to %s to fetch WAL: poll error: %s",
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
case PGRES_POLLING_READING:
state->rem_state = RS_CONNECTING_READ;
return NEON_WALREAD_WOULDBLOCK;
case PGRES_POLLING_WRITING:
state->rem_state = RS_CONNECTING_WRITE;
return NEON_WALREAD_WOULDBLOCK;
case PGRES_POLLING_OK:
{
/* connection successfully established */
char start_repl_query[128];
snprintf(start_repl_query, sizeof(start_repl_query),
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
state->donor_name, start_repl_query);
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to send %s query to %s: %s",
start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
state->rem_state = RS_WAIT_EXEC_RESULT;
break;
}
default: /* there is unused PGRES_POLLING_ACTIVE */
Assert(false);
return NEON_WALREAD_ERROR; /* keep the compiler quiet */
}
}
if (state->rem_state == RS_WAIT_EXEC_RESULT)
{
switch (libpqwp_get_query_result(state->wp_conn))
{
case WP_EXEC_SUCCESS_COPYBOTH:
state->rem_state = RS_ESTABLISHED;
break;
case WP_EXEC_NEEDS_INPUT:
return NEON_WALREAD_WOULDBLOCK;
case WP_EXEC_FAILED:
snprintf(state->err_msg, sizeof(state->err_msg),
"get START_REPLICATION result from %s failed: %s",
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
default: /* can't happen */
snprintf(state->err_msg, sizeof(state->err_msg),
"get START_REPLICATION result from %s: unexpected result",
state->donor_name);
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
}
Assert(state->rem_state == RS_ESTABLISHED);
/*
* If we had the request before, verify args are the same and advance the
* result ptr according to the progress; otherwise register the request.
*/
if (state->req_lsn != InvalidXLogRecPtr)
{
if (state->req_lsn != startptr || state->req_len != count)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"args changed during request, was %X/%X %zu, now %X/%X %zu",
LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
LSN_FORMAT_ARGS(startptr),
count,
state->req_progress);
buf += state->req_progress;
}
else
{
state->req_lsn = startptr;
state->req_len = count;
state->req_progress = 0;
nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
LSN_FORMAT_ARGS(startptr),
count);
}
while (true)
{
Size to_copy;
/*
* If we have no ready data, receive new message.
*/
if (state->wal_rem_len == 0 &&
/*
* check for the sake of 0 length reads; walproposer does these for
* heartbeats, though generally they shouldn't hit remote source.
*/
state->req_len - state->req_progress > 0)
{
NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
if (read_msg_res != NEON_WALREAD_SUCCESS)
return read_msg_res;
}
if (state->req_lsn + state->req_progress != state->rem_lsn)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
LSN_FORMAT_ARGS(state->rem_lsn),
LSN_FORMAT_ARGS(state->req_lsn),
state->req_len);
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
/* We can copy min of (available, requested) bytes. */
to_copy =
Min(state->req_len - state->req_progress, state->wal_rem_len);
memcpy(buf, state->wal_ptr, to_copy);
state->wal_ptr += to_copy;
state->wal_rem_len -= to_copy;
state->rem_lsn += to_copy;
if (state->wal_rem_len == 0)
state->wal_ptr = NULL; /* freed by libpqwalproposer */
buf += to_copy;
state->req_progress += to_copy;
if (state->req_progress == state->req_len)
{
XLogSegNo next_segno;
XLogSegNo req_segno;
XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
/*
* Request completed. If there is a chance of serving next one
* locally, close the connection.
*/
if (state->req_lsn < state->available_lsn &&
state->rem_lsn >= state->available_lsn)
{
nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
NeonWALReaderResetRemote(state);
}
else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
{
nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
LSN_FORMAT_ARGS(state->rem_lsn));
NeonWALReaderResetRemote(state);
}
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;
return NEON_WALREAD_SUCCESS;
}
}
}
/*
* Read one WAL message from the stream, sets state->wal_ptr in case of success.
* Resets remote state in case of failure.
*/
static NeonWALReadResult
NeonWALReaderReadMsg(NeonWALReader *state)
{
while (true) /* loop until we get 'w' */
{
char *copydata_ptr;
int copydata_size;
StringInfoData s;
char msg_type;
int hdrlen;
Assert(state->rem_state == RS_ESTABLISHED);
Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
switch (libpqwp_async_read(state->wp_conn,
&copydata_ptr,
&copydata_size))
{
case PG_ASYNC_READ_SUCCESS:
break;
case PG_ASYNC_READ_TRY_AGAIN:
return NEON_WALREAD_WOULDBLOCK;
case PG_ASYNC_READ_FAIL:
snprintf(state->err_msg,
sizeof(state->err_msg),
"req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
LSN_FORMAT_ARGS(state->req_lsn),
state->req_len,
state->req_progress,
PQerrorMessage(state->wp_conn->pg_conn));
goto err;
}
/* put data on StringInfo to parse */
s.data = copydata_ptr;
s.len = copydata_size;
s.cursor = 0;
s.maxlen = -1;
if (copydata_size == 0)
{
snprintf(state->err_msg,
sizeof(state->err_msg),
"zero length copydata received");
goto err;
}
msg_type = pq_getmsgbyte(&s);
switch (msg_type)
{
case 'w':
{
XLogRecPtr start_lsn;
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
if (s.len - s.cursor < hdrlen)
{
snprintf(state->err_msg,
sizeof(state->err_msg),
"invalid WAL message received from primary");
goto err;
}
start_lsn = pq_getmsgint64(&s);
pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */
pq_getmsgint64(&s); /* TimestampTz send_time */
state->rem_lsn = start_lsn;
state->wal_rem_len = (Size) (s.len - s.cursor);
state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
return NEON_WALREAD_SUCCESS;
}
case 'k':
{
XLogRecPtr end_lsn;
bool reply_requested;
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
if (s.len - s.cursor < hdrlen)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"invalid keepalive message received from primary");
goto err;
}
end_lsn = pq_getmsgint64(&s);
pq_getmsgint64(&s); /* TimestampTz timestamp; */
reply_requested = pq_getmsgbyte(&s);
nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
LSN_FORMAT_ARGS(end_lsn),
reply_requested);
if (end_lsn < state->req_lsn + state->req_len)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
goto err;
}
continue;
}
default:
nwr_log(WARNING, "invalid replication message type %d", msg_type);
continue;
}
}
err:
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
/* reset remote connection and request in progress */
static void
NeonWALReaderResetRemote(NeonWALReader *state)
{
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;
state->rem_state = RS_NONE;
if (state->wp_conn)
{
libpqwp_disconnect(state->wp_conn);
state->wp_conn = NULL;
}
state->donor_name[0] = '\0';
state->wal_ptr = NULL;
state->wal_rem_len = 0;
state->rem_lsn = InvalidXLogRecPtr;
}
/*
* Return socket of connection to remote source. Must be called only when
* connection exists (NeonWALReaderEvents returns non zero).
*/
pgsocket
NeonWALReaderSocket(NeonWALReader *state)
{
if (!state->wp_conn)
nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
return PQsocket(state->wp_conn->pg_conn);
}
/*
* Whether remote connection is established. Once this is done, until successful
* local read or error socket is stable and user can update socket events
* instead of readding it each time.
*/
bool
NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
{
return state->rem_state == RS_ESTABLISHED;
}
/*
* Returns events user should wait on connection socket or 0 if remote
* connection is not active.
*/
extern uint32
NeonWALReaderEvents(NeonWALReader *state)
{
switch (state->rem_state)
{
case RS_NONE:
return 0;
case RS_CONNECTING_READ:
return WL_SOCKET_READABLE;
case RS_CONNECTING_WRITE:
return WL_SOCKET_WRITEABLE;
case RS_WAIT_EXEC_RESULT:
case RS_ESTABLISHED:
return WL_SOCKET_READABLE;
default:
Assert(false);
return 0; /* make compiler happy */
}
}
static bool
NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
char *p;
XLogRecPtr recptr;
Size nbytes;
p = buf;
recptr = startptr;
nbytes = count;
while (nbytes > 0)
{
uint32 startoff;
int segbytes;
int readbytes;
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
/*
* If the data we want is not in a segment we have open, close what we
* have (if anything) and open the next one, using the caller's
* provided openSegment callback.
*/
if (state->seg.ws_file < 0 ||
!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
tli != state->seg.ws_tli)
{
XLogSegNo nextSegNo;
neon_wal_segment_close(state);
XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
if (!neon_wal_segment_open(state, nextSegNo, &tli))
{
char fname[MAXFNAMELEN];
state->wre_errno = errno;
XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
return false;
}
/* This shouldn't happen -- indicates a bug in segment_open */
Assert(state->seg.ws_file >= 0);
/* Update the current segment info. */
state->seg.ws_tli = tli;
state->seg.ws_segno = nextSegNo;
}
/* How many bytes are within this segment? */
if (nbytes > (state->segcxt.ws_segsize - startoff))
segbytes = state->segcxt.ws_segsize - startoff;
else
segbytes = nbytes;
#ifndef FRONTEND
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
#endif
/* Reset errno first; eases reporting non-errno-affecting errors */
errno = 0;
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
#ifndef FRONTEND
pgstat_report_wait_end();
#endif
if (readbytes <= 0)
{
char fname[MAXFNAMELEN];
XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
if (readbytes < 0)
{
state->wre_errno = errno;
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
fname, startoff, strerror(state->wre_errno));
}
else
{
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
fname, startoff);
}
return false;
}
/* Update state for read */
recptr += readbytes;
nbytes -= readbytes;
p += readbytes;
}
return true;
}
/*
* Copy of vanilla wal_segment_open, but returns false in case of error instead
* of ERROR, with errno set.
*
* XLogReaderRoutine->segment_open callback for local pg_wal files
*/
static bool
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
TimeLineID *tli_p)
{
TimeLineID tli = *tli_p;
char path[MAXPGPATH];
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
nwr_log(DEBUG5, "opening %s", path);
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
if (state->seg.ws_file >= 0)
return true;
return false;
}
static bool
is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
{
struct stat stat_buffer;
char path[MAXPGPATH];
XLogFilePath(path, tli, segno, segsize);
return stat(path, &stat_buffer) == 0;
}
/* copy of vanilla wal_segment_close with NeonWALReader */
static void
neon_wal_segment_close(NeonWALReader *state)
{
if (state->seg.ws_file >= 0)
{
close(state->seg.ws_file);
/* need to check errno? */
state->seg.ws_file = -1;
}
}
char *
NeonWALReaderErrMsg(NeonWALReader *state)
{
return state->err_msg;
}

View File

@@ -0,0 +1,30 @@
#ifndef __NEON_WALREADER_H__
#define __NEON_WALREADER_H__
#include "access/xlogdefs.h"
/* forward declare so we don't have to expose the struct to the public */
struct NeonWALReader;
typedef struct NeonWALReader NeonWALReader;
/* avoid including walproposer.h as it includes us */
struct WalProposer;
typedef struct WalProposer WalProposer;
/* NeonWALRead return value */
typedef enum
{
NEON_WALREAD_SUCCESS,
NEON_WALREAD_WOULDBLOCK,
NEON_WALREAD_ERROR,
} NeonWALReadResult;
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
extern void NeonWALReaderFree(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
#endif /* __NEON_WALREADER_H__ */

File diff suppressed because it is too large Load Diff

View File

@@ -8,6 +8,9 @@
#include "replication/walreceiver.h"
#include "utils/uuid.h"
#include "libpqwalproposer.h"
#include "neon_walreader.h"
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
@@ -20,43 +23,9 @@
*/
#define WL_NO_EVENTS 0
struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */
struct WalProposerConn; /* Defined in libpqwalproposer.h */
typedef struct WalProposerConn WalProposerConn;
/* Possible return values from ReadPGAsync */
typedef enum
{
/* The full read was successful. buf now points to the data */
PG_ASYNC_READ_SUCCESS,
/*
* The read is ongoing. Wait until the connection is read-ready, then try
* again.
*/
PG_ASYNC_READ_TRY_AGAIN,
/* Reading failed. Check PQerrorMessage(conn) */
PG_ASYNC_READ_FAIL,
} PGAsyncReadResult;
/* Possible return values from WritePGAsync */
typedef enum
{
/* The write fully completed */
PG_ASYNC_WRITE_SUCCESS,
/*
* The write started, but you'll need to call PQflush some more times to
* finish it off. We just tried, so it's best to wait until the connection
* is read- or write-ready to try again.
*
* If it becomes read-ready, call PQconsumeInput and flush again. If it
* becomes write-ready, just call PQflush.
*/
PG_ASYNC_WRITE_TRY_FLUSH,
/* Writing failed. Check PQerrorMessage(conn) */
PG_ASYNC_WRITE_FAIL,
} PGAsyncWriteResult;
/*
* WAL safekeeper state, which is used to wait for some event.
*
@@ -133,6 +102,40 @@ typedef enum
SS_ACTIVE,
} SafekeeperState;
/*
* Sending WAL substates of SS_ACTIVE.
*/
typedef enum
{
/*
* We are ready to send more WAL, waiting for latch set to learn about
* more WAL becoming available (or just a timeout to send heartbeat).
*/
SS_ACTIVE_SEND,
/*
* Polling neon_walreader to receive chunk of WAL (probably remotely) to
* send to this safekeeper.
*
* Note: socket management is done completely inside walproposer_pg for
* simplicity, and thus simulation doesn't test it. Which is fine as
* simulation is mainly aimed at consensus checks, not waiteventset
* management.
*
* Also, while in this state we don't touch safekeeper socket, so in
* theory it might close connection as inactive. This can be addressed if
* needed; however, while fetching WAL we should regularly send it, so the
* problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
* walreader socket), but similarly shouldn't be a problem.
*/
SS_ACTIVE_READ_WAL,
/*
* Waiting for write readiness to flush the socket.
*/
SS_ACTIVE_FLUSH,
} SafekeeperActiveState;
/* Consensus logical timestamp. */
typedef uint64 term_t;
@@ -341,12 +344,11 @@ typedef struct Safekeeper
*/
XLogRecPtr startStreamingAt;
bool flushWrite; /* set to true if we need to call AsyncFlush,*
* to flush pending messages */
XLogRecPtr streamingAt; /* current streaming position */
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
SafekeeperState state; /* safekeeper state machine state */
SafekeeperActiveState active_state;
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
AcceptorGreeting greetResponse; /* acceptor greeting */
VoteResponse voteResponse; /* the vote */
@@ -367,12 +369,27 @@ typedef struct Safekeeper
/*
* WAL reader, allocated for each safekeeper.
*/
XLogReaderState *xlogreader;
NeonWALReader *xlogreader;
/*
* Position in wait event set. Equal to -1 if no event
*/
int eventPos;
/*
* Neon WAL reader position in wait event set, or -1 if no socket. Note
* that event must be removed not only on error/failure, but also on
* successful *local* read, as next read might again be remote, but with
* different socket.
*/
int nwrEventPos;
/*
* Per libpq docs, during connection establishment socket might change,
* remember here if it is stable to avoid readding to the event set if
* possible. Must be reset whenever nwr event is deleted.
*/
bool nwrConnEstablished;
#endif
@@ -401,31 +418,6 @@ typedef enum
*/
} WalProposerConnectPollStatusType;
/* Re-exported and modified ExecStatusType */
typedef enum
{
/* We received a single CopyBoth result */
WP_EXEC_SUCCESS_COPYBOTH,
/*
* Any success result other than a single CopyBoth was received. The
* specifics of the result were already logged, but it may be useful to
* provide an error message indicating which safekeeper messed up.
*
* Do not expect PQerrorMessage to be appropriately set.
*/
WP_EXEC_UNEXPECTED_SUCCESS,
/*
* No result available at this time. Wait until read-ready, then call
* again. Internally, this is returned when PQisBusy indicates that
* PQgetResult would block.
*/
WP_EXEC_NEEDS_INPUT,
/* Catch-all failure. Check PQerrorMessage. */
WP_EXEC_FAILED,
} WalProposerExecStatusType;
/* Re-exported ConnStatusType */
typedef enum
{
@@ -486,7 +478,7 @@ typedef struct walproposer_api
/* Flush buffer to the network, aka PQflush. */
int (*conn_flush) (Safekeeper *sk);
/* Close the connection, aka PQfinish. */
/* Reset sk state: close pq connection, deallocate xlogreader. */
void (*conn_finish) (Safekeeper *sk);
/*
@@ -503,17 +495,20 @@ typedef struct walproposer_api
/* Blocking CopyData write, aka PQputCopyData + PQflush. */
bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);
/* Download WAL from startpos to endpos and make it available locally. */
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
/* Read WAL from disk to buf. */
void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
/*
* Download WAL before basebackup for logical walsenders from sk, if
* needed
*/
bool (*recovery_download) (WalProposer *wp, Safekeeper *sk);
/* Allocate WAL reader. */
void (*wal_reader_allocate) (Safekeeper *sk);
/* Deallocate event set. */
void (*free_event_set) (WalProposer *wp);
/* Read WAL from disk to buf. */
NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg);
/* Returns events to be awaited on WAL reader, if any. */
uint32 (*wal_reader_events) (Safekeeper *sk);
/* Initialize event set. */
void (*init_event_set) (WalProposer *wp);
@@ -521,9 +516,15 @@ typedef struct walproposer_api
/* Update events for an existing safekeeper connection. */
void (*update_event_set) (Safekeeper *sk, uint32 events);
/* Configure wait event set for yield in SS_ACTIVE. */
void (*active_state_update_event_set) (Safekeeper *sk);
/* Add a new safekeeper connection to the event set. */
void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
/* Remove safekeeper connection from event set */
void (*rm_safekeeper_event_set) (Safekeeper *sk);
/*
* Wait until some event happens: - timeout is reached - socket event for
* safekeeper connection - new WAL is available
@@ -556,26 +557,12 @@ typedef struct walproposer_api
*/
void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
/*
* Called on peer_horizon_lsn updates. Used to advance replication slot
* and to free up disk space by deleting unnecessary WAL.
*/
void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
/*
* Write a log message to the internal log processor. This is used only
* when walproposer is compiled as a library. Otherwise, all logging is
* handled by elog().
*/
void (*log_internal) (WalProposer *wp, int level, const char *line);
/*
* Called right after the proposer was elected, but before it started
* recovery and sent ProposerElected message to the safekeepers.
*
* Used by logical replication to update truncateLsn.
*/
void (*after_election) (WalProposer *wp);
} walproposer_api;
/*
@@ -709,15 +696,34 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
extern void WalProposerPoll(WalProposer *wp);
extern void WalProposerFree(WalProposer *wp);
/*
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
* recreate set from scratch, hence the export.
*/
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
#define WPEVENT 1337 /* special log level for walproposer internal
* events */
#define WP_LOG_PREFIX "[WP] "
/*
* wp_log is used in pure wp code (walproposer.c), allowing API callback to
* catch logging.
*/
#ifdef WALPROPOSER_LIB
extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
#else
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
#endif
/*
* And wpg_log is used all other (postgres specific) walproposer code, just
* adding prefix.
*/
#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
#endif /* __NEON_WALPROPOSER_H__ */

View File

@@ -12,6 +12,7 @@
#include <unistd.h>
#include <sys/stat.h>
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
#include "access/xlogutils.h"
#include "access/xloginsert.h"
@@ -43,14 +44,19 @@
#include "utils/ps_status.h"
#include "utils/timestamp.h"
#include "neon.h"
#include "walproposer.h"
#include "libpq-fe.h"
#include "libpqwalproposer.h"
#include "neon.h"
#include "neon_walreader.h"
#include "walproposer.h"
#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender*
* message header */
#define MB ((XLogRecPtr)1024 * 1024)
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
char *wal_acceptors_list = "";
@@ -91,6 +97,12 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
static void XLogWalPropClose(XLogRecPtr recptr);
static void add_nwr_event_set(Safekeeper *sk, uint32 events);
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
static void
init_walprop_config(bool syncSafekeepers)
{
@@ -214,7 +226,6 @@ backpressure_lag_impl(void)
XLogRecPtr myFlushLsn = GetFlushRecPtr();
#endif
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
#define MB ((XLogRecPtr)1024 * 1024)
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
LSN_FORMAT_ARGS(myFlushLsn),
@@ -413,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
{
StartReplicationCmd cmd;
elog(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
cmd.timeline = wp->greetRequest.timeline;
cmd.startpoint = startpos;
@@ -538,17 +549,9 @@ walprop_pg_load_libpqwalreceiver(void)
{
load_file("libpqwalreceiver", false);
if (WalReceiverFunctions == NULL)
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
}
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
struct WalProposerConn
{
PGconn *pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received data from walprop_async_read */
};
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -586,16 +589,17 @@ walprop_status(Safekeeper *sk)
}
}
static void
walprop_connect_start(Safekeeper *sk)
WalProposerConn *
libpqwp_connect_start(char *conninfo)
{
PGconn *pg_conn;
WalProposerConn *conn;
const char *keywords[3];
const char *values[3];
int n;
char *password = neon_auth_token;
Assert(sk->conn == NULL);
/*
* Connect using the given connection string. If the NEON_AUTH_TOKEN
@@ -614,7 +618,7 @@ walprop_connect_start(Safekeeper *sk)
n++;
}
keywords[n] = "dbname";
values[n] = sk->conninfo;
values[n] = conninfo;
n++;
keywords[n] = NULL;
values[n] = NULL;
@@ -626,7 +630,7 @@ walprop_connect_start(Safekeeper *sk)
* PGconn structure"
*/
if (!pg_conn)
elog(FATAL, "failed to allocate new PGconn object");
wpg_log(FATAL, "failed to allocate new PGconn object");
/*
* And in theory this allocation can fail as well, but it's incredibly
@@ -635,11 +639,20 @@ walprop_connect_start(Safekeeper *sk)
* palloc will exit on failure though, so there's not much we could do if
* it *did* fail.
*/
sk->conn = palloc(sizeof(WalProposerConn));
sk->conn->pg_conn = pg_conn;
sk->conn->is_nonblocking = false; /* connections always start in
* blocking mode */
sk->conn->recvbuf = NULL;
conn = palloc(sizeof(WalProposerConn));
conn->pg_conn = pg_conn;
conn->is_nonblocking = false; /* connections always start in blocking
* mode */
conn->recvbuf = NULL;
return conn;
}
static void
walprop_connect_start(Safekeeper *sk)
{
Assert(sk->conn == NULL);
sk->conn = libpqwp_connect_start(sk->conninfo);
}
static WalProposerConnectPollStatusType
@@ -667,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk)
* unused. We'll expect it's never returned.
*/
case PGRES_POLLING_ACTIVE:
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
/*
* This return is never actually reached, but it's here to make
@@ -683,26 +696,33 @@ walprop_connect_poll(Safekeeper *sk)
return return_val;
}
static bool
walprop_send_query(Safekeeper *sk, char *query)
extern bool
libpqwp_send_query(WalProposerConn *conn, char *query)
{
/*
* We need to be in blocking mode for sending the query to run without
* requiring a call to PQflush
*/
if (!ensure_nonblocking_status(sk->conn, false))
if (!ensure_nonblocking_status(conn, false))
return false;
/* PQsendQuery returns 1 on success, 0 on failure */
if (!PQsendQuery(sk->conn->pg_conn, query))
if (!PQsendQuery(conn->pg_conn, query))
return false;
return true;
}
static WalProposerExecStatusType
walprop_get_query_result(Safekeeper *sk)
static bool
walprop_send_query(Safekeeper *sk, char *query)
{
return libpqwp_send_query(sk->conn, query);
}
WalProposerExecStatusType
libpqwp_get_query_result(WalProposerConn *conn)
{
PGresult *result;
WalProposerExecStatusType return_val;
@@ -710,14 +730,14 @@ walprop_get_query_result(Safekeeper *sk)
char *unexpected_success = NULL;
/* Consume any input that we might be missing */
if (!PQconsumeInput(sk->conn->pg_conn))
if (!PQconsumeInput(conn->pg_conn))
return WP_EXEC_FAILED;
if (PQisBusy(sk->conn->pg_conn))
if (PQisBusy(conn->pg_conn))
return WP_EXEC_NEEDS_INPUT;
result = PQgetResult(sk->conn->pg_conn);
result = PQgetResult(conn->pg_conn);
/*
* PQgetResult returns NULL only if getting the result was successful &
@@ -725,7 +745,7 @@ walprop_get_query_result(Safekeeper *sk)
*/
if (!result)
{
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
return WP_EXEC_UNEXPECTED_SUCCESS;
}
@@ -773,11 +793,17 @@ walprop_get_query_result(Safekeeper *sk)
}
if (unexpected_success)
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
return return_val;
}
static WalProposerExecStatusType
walprop_get_query_result(Safekeeper *sk)
{
return libpqwp_get_query_result(sk->conn);
}
static pgsocket
walprop_socket(Safekeeper *sk)
{
@@ -790,42 +816,31 @@ walprop_flush(Safekeeper *sk)
return (PQflush(sk->conn->pg_conn));
}
static void
walprop_finish(Safekeeper *sk)
/* Like libpqrcv_receive. *buf is valid until the next call. */
PGAsyncReadResult
libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
{
if (!sk->conn)
return;
int rawlen;
if (sk->conn->recvbuf != NULL)
PQfreemem(sk->conn->recvbuf);
PQfinish(sk->conn->pg_conn);
pfree(sk->conn);
sk->conn = NULL;
}
/*
* Receive a message from the safekeeper.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
static PGAsyncReadResult
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
{
int result;
if (sk->conn->recvbuf != NULL)
if (conn->recvbuf != NULL)
{
PQfreemem(sk->conn->recvbuf);
sk->conn->recvbuf = NULL;
PQfreemem(conn->recvbuf);
conn->recvbuf = NULL;
}
/* Call PQconsumeInput so that we have the data we need */
if (!PQconsumeInput(sk->conn->pg_conn))
/* Try to receive a CopyData message */
rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
if (rawlen == 0)
{
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
/* Try consuming some data. */
if (!PQconsumeInput(conn->pg_conn))
{
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
}
/* Now that we've consumed some input, try again */
rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
}
/*
@@ -839,7 +854,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
* sometimes be triggered by the server returning an ErrorResponse (which
* also happens to have the effect that the copy is done).
*/
switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
switch (rawlen)
{
case 0:
*amount = 0;
@@ -854,10 +869,10 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
* We can check PQgetResult to make sure that the server
* failed; it'll always result in PGRES_FATAL_ERROR
*/
ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
if (status != PGRES_FATAL_ERROR)
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
/*
* If there was actually an error, it'll be properly reported
@@ -874,12 +889,24 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
return PG_ASYNC_READ_FAIL;
default:
/* Positive values indicate the size of the returned result */
*amount = result;
*buf = sk->conn->recvbuf;
*amount = rawlen;
*buf = conn->recvbuf;
return PG_ASYNC_READ_SUCCESS;
}
}
/*
* Receive a message from the safekeeper.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
static PGAsyncReadResult
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
{
return libpqwp_async_read(sk->conn, buf, amount);
}
static PGAsyncWriteResult
walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
{
@@ -910,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
elog(FATAL, "invalid return %d from PQputCopyData", result);
wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
}
/*
@@ -931,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
elog(FATAL, "invalid return %d from PQflush", result);
wpg_log(FATAL, "invalid return %d from PQflush", result);
}
}
@@ -962,6 +989,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
return true;
}
void
libpqwp_disconnect(WalProposerConn *conn)
{
if (conn->recvbuf != NULL)
PQfreemem(conn->recvbuf);
PQfinish(conn->pg_conn);
pfree(conn);
}
static void
walprop_finish(Safekeeper *sk)
{
if (sk->conn)
{
libpqwp_disconnect(sk->conn);
sk->conn = NULL;
}
/* free xlogreader */
if (sk->xlogreader)
{
NeonWALReaderFree(sk->xlogreader);
sk->xlogreader = NULL;
}
rm_safekeeper_event_set(sk, false);
}
/*
* Subscribe for new WAL and stream it in the loop to safekeepers.
*
@@ -1165,16 +1219,25 @@ XLogBroadcastWalProposer(WalProposer *wp)
}
}
/*
* Receive WAL from most advanced safekeeper
*/
/* Download WAL before basebackup for logical walsenders from sk, if needed */
static bool
WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
{
char *err;
WalReceiverConn *wrconn;
WalRcvStreamOptions options;
char conninfo[MAXCONNINFO];
TimeLineID timeline;
XLogRecPtr startpos;
XLogRecPtr endpos;
uint64 download_range_mb;
startpos = GetLogRepRestartLSN(wp);
if (startpos == InvalidXLogRecPtr)
return true; /* recovery not needed */
endpos = wp->propEpochStartLsn;
timeline = wp->greetRequest.timeline;
if (!neon_auth_token)
{
@@ -1186,7 +1249,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
if (written > MAXCONNINFO || written < 0)
elog(FATAL, "could not append password to the safekeeper connection string");
wpg_log(FATAL, "could not append password to the safekeeper connection string");
}
#if PG_MAJORVERSION_NUM < 16
@@ -1203,11 +1266,11 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
err)));
return false;
}
elog(LOG,
"start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
sk->host, sk->port, (uint32) (startpos >> 32),
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
wpg_log(LOG,
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
sk->host, sk->port, (uint32) (startpos >> 32),
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
options.logical = false;
options.startpoint = startpos;
@@ -1400,28 +1463,54 @@ XLogWalPropClose(XLogRecPtr recptr)
walpropFile = -1;
}
static void
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
{
WALReadError errinfo;
if (!WALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id(),
&errinfo))
{
WALReadRaiseError(&errinfo);
}
}
static void
walprop_pg_wal_reader_allocate(Safekeeper *sk)
{
sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
char log_prefix[64];
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
if (sk->xlogreader == NULL)
elog(FATAL, "Failed to allocate xlog reader");
wpg_log(FATAL, "failed to allocate xlog reader");
}
static NeonWALReadResult
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg)
{
NeonWALReadResult res;
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
if (res == NEON_WALREAD_SUCCESS)
{
/*
* If we have the socket subscribed, but walreader doesn't need any
* events, it must mean that remote connection just closed hoping to
* do next read locally. Remove the socket then. It is important to do
* as otherwise next read might open another connection and we won't
* be able to distinguish whether we have correct socket added in wait
* event set.
*/
if (NeonWALReaderEvents(sk->xlogreader) == 0)
rm_safekeeper_event_set(sk, false);
}
else if (res == NEON_WALREAD_ERROR)
{
*errmsg = NeonWALReaderErrMsg(sk->xlogreader);
}
return res;
}
static uint32
walprop_pg_wal_reader_events(Safekeeper *sk)
{
return NeonWALReaderEvents(sk->xlogreader);
}
static WaitEventSet *waitEvents;
@@ -1438,6 +1527,8 @@ walprop_pg_free_event_set(WalProposer *wp)
for (int i = 0; i < wp->n_safekeepers; i++)
{
wp->safekeeper[i].eventPos = -1;
wp->safekeeper[i].nwrEventPos = -1;
wp->safekeeper[i].nwrConnEstablished = false;
}
}
@@ -1445,13 +1536,39 @@ static void
walprop_pg_init_event_set(WalProposer *wp)
{
if (waitEvents)
elog(FATAL, "double-initialization of event set");
wpg_log(FATAL, "double-initialization of event set");
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
/* for each sk, we have socket plus potentially socket for neon walreader */
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
for (int i = 0; i < wp->n_safekeepers; i++)
{
wp->safekeeper[i].eventPos = -1;
wp->safekeeper[i].nwrEventPos = -1;
wp->safekeeper[i].nwrConnEstablished = false;
}
}
/* add safekeeper socket to wait event set */
static void
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
{
Assert(sk->eventPos == -1);
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
}
/* add neon wal reader socket to wait event set */
static void
add_nwr_event_set(Safekeeper *sk, uint32 events)
{
Assert(sk->nwrEventPos == -1);
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
}
static void
@@ -1463,10 +1580,144 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
}
/*
* Update neon_walreader event.
* Can be called when nwr socket doesn't exist, does nothing in this case.
*/
static void
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
update_nwr_event_set(Safekeeper *sk, uint32 events)
{
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
/* eventPos = -1 when we don't have an event */
if (sk->nwrEventPos != -1)
ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
}
static void
walprop_pg_active_state_update_event_set(Safekeeper *sk)
{
uint32 sk_events;
uint32 nwr_events;
Assert(sk->state == SS_ACTIVE);
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
/*
* If we need to wait for neon_walreader, ensure we have up to date socket
* in the wait event set.
*/
if (sk->active_state == SS_ACTIVE_READ_WAL)
{
/*
* If conn is established and socket is thus stable, update the event
* directly; otherwise re-add it.
*/
if (sk->nwrConnEstablished)
{
Assert(sk->nwrEventPos != -1);
update_nwr_event_set(sk, nwr_events);
}
else
{
rm_safekeeper_event_set(sk, false);
add_nwr_event_set(sk, nwr_events);
}
}
else
{
/*
* Hack: we should always set 0 here, but for random reasons
* WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
* some event. Since there is also no way to remove socket except
* reconstructing the whole set, SafekeeperStateDesiredEvents instead
* gives WL_SOCKET_CLOSED if socket exists. We never expect it to
* trigger.
*
* On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event
* removal.
*/
#if PG_VERSION_NUM >= 150000
Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
update_nwr_event_set(sk, WL_SOCKET_CLOSED);
#else /* pg 14 */
rm_safekeeper_event_set(sk, false);
#endif
}
walprop_pg_update_event_set(sk, sk_events);
}
static void
walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
{
rm_safekeeper_event_set(to_remove, true);
}
/*
* A hacky way to remove single event from the event set. Can be called if event
* doesn't exist, does nothing in this case.
*
* Note: Internally, this completely reconstructs the event set. It should be
* avoided if possible.
*
* If is_sk is true, socket of connection to safekeeper is removed; otherwise
* socket of neon_walreader.
*/
static void
rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
{
WalProposer *wp = to_remove->wp;
wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
to_remove->host, to_remove->port, is_sk);
/*
* Shortpath for exiting if have nothing to do. We never call this
* function with safekeeper socket not existing, but do that with neon
* walreader socket.
*/
if ((is_sk && to_remove->eventPos == -1) ||
(!is_sk && to_remove->nwrEventPos == -1))
{
return;
}
/* Remove the existing event set, assign sk->eventPos = -1 */
walprop_pg_free_event_set(wp);
/* Re-initialize it without adding any safekeeper events */
wp->api.init_event_set(wp);
/*
* loop through the existing safekeepers. If they aren't the one we're
* removing, and if they have a socket we can use, re-add the applicable
* events.
*/
for (int i = 0; i < wp->n_safekeepers; i++)
{
Safekeeper *sk = &wp->safekeeper[i];
/*
* If this safekeeper isn't offline, add events for it, except for the
* event requested to remove.
*/
if (sk->state != SS_OFFLINE)
{
uint32 sk_events;
uint32 nwr_events;
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
if (sk != to_remove || !is_sk)
{
/* will set sk->eventPos */
wp->api.add_safekeeper_event_set(sk, sk_events);
}
if ((sk != to_remove || is_sk) && nwr_events)
{
add_nwr_event_set(sk, nwr_events);
}
}
}
}
static int
@@ -1484,8 +1735,8 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);
/*
* Now that we prepared the condvar, check flush ptr again -- it might have
* changed before we subscribed to cv so we missed the wakeup.
* Now that we prepared the condvar, check flush ptr again -- it might
* have changed before we subscribed to cv so we missed the wakeup.
*
* Do that only when we're interested in new WAL: without sync-safekeepers
* and if election already passed.
@@ -1548,7 +1799,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
}
/*
* Get PageserverFeedback fields from the most advanced safekeeper
* Choose most advanced PageserverFeedback and set it to *rf.
*/
static void
GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
@@ -1571,15 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->last_received_lsn),
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
rf->replytime);
replication_feedback_set(rf);
wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
rf->currentClusterSize,
LSN_FORMAT_ARGS(rf->last_received_lsn),
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
rf->replytime);
}
/*
@@ -1619,63 +1868,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
hs->catalog_xmin = InvalidFullTransactionId;
}
/*
* Based on commitLsn and safekeeper responses including pageserver feedback,
* 1) Propagate cluster size received from ps to ensure the limit.
* 2) Propagate pageserver LSN positions to ensure backpressure limits.
* 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters).
* 4) Propagate hot standby feedback.
*
* None of that is functional in sync-safekeepers.
*/
static void
walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
{
HotStandbyFeedback hsFeedback;
XLogRecPtr diskConsistentLsn;
XLogRecPtr oldDiskConsistentLsn;
diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
if (wp->config->syncSafekeepers)
return;
if (!wp->config->syncSafekeepers)
oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
/* Get PageserverFeedback fields from the most advanced safekeeper */
GetLatestNeonFeedback(&quorumFeedback.rf, wp);
replication_feedback_set(&quorumFeedback.rf);
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
{
/* Get PageserverFeedback fields from the most advanced safekeeper */
GetLatestNeonFeedback(&quorumFeedback.rf, wp);
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
}
if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
{
if (commitLsn > quorumFeedback.flushLsn)
quorumFeedback.flushLsn = commitLsn;
/* advance the replication slot */
if (!wp->config->syncSafekeepers)
ProcessStandbyReply(
/* write_lsn - This is what durably stored in WAL service. */
quorumFeedback.flushLsn,
/* flush_lsn - This is what durably stored in WAL service. */
quorumFeedback.flushLsn,
/*
* Advance the replication slot to commitLsn. WAL before it is
* hardened and will be fetched from one of safekeepers by
* neon_walreader if needed.
*
* Also wakes up syncrep waiters.
*/
ProcessStandbyReply(
/* write_lsn - This is what durably stored in WAL service. */
quorumFeedback.flushLsn,
/* flush_lsn - This is what durably stored in WAL service. */
quorumFeedback.flushLsn,
/*
* apply_lsn - This is what processed and durably saved at*
* pageserver.
*/
quorumFeedback.rf.disk_consistent_lsn,
walprop_pg_get_current_timestamp(wp), false);
/*
* apply_lsn - This is what processed and durably saved at*
* pageserver.
*/
quorumFeedback.rf.disk_consistent_lsn,
walprop_pg_get_current_timestamp(wp), false);
}
CombineHotStanbyFeedbacks(&hsFeedback, wp);
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
{
quorumFeedback.hs = hsFeedback;
if (!wp->config->syncSafekeepers)
ProcessStandbyHSFeedback(hsFeedback.ts,
XidFromFullTransactionId(hsFeedback.xmin),
EpochFromFullTransactionId(hsFeedback.xmin),
XidFromFullTransactionId(hsFeedback.catalog_xmin),
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
ProcessStandbyHSFeedback(hsFeedback.ts,
XidFromFullTransactionId(hsFeedback.xmin),
EpochFromFullTransactionId(hsFeedback.xmin),
XidFromFullTransactionId(hsFeedback.catalog_xmin),
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
}
}
static void
walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
{
if (MyReplicationSlot)
PhysicalConfirmReceivedLocation(lsn);
}
static XLogRecPtr
walprop_pg_get_redo_start_lsn(WalProposer *wp)
{
@@ -1694,15 +1949,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
}
static void
walprop_pg_after_election(WalProposer *wp)
static XLogRecPtr
GetLogRepRestartLSN(WalProposer *wp)
{
FILE *f;
XLogRecPtr lrRestartLsn;
XLogRecPtr lrRestartLsn = InvalidXLogRecPtr;
/* We don't need to do anything in syncSafekeepers mode. */
if (wp->config->syncSafekeepers)
return;
return InvalidXLogRecPtr;
/*
* If there are active logical replication subscription we need to provide
@@ -1710,22 +1965,40 @@ walprop_pg_after_election(WalProposer *wp)
* replication slots.
*/
f = fopen("restart.lsn", "rb");
if (f != NULL && !wp->config->syncSafekeepers)
if (f != NULL)
{
size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
fclose(f);
if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
{
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
uint64 download_range_mb;
wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
/*
* If we need to download more than a max_slot_wal_keep_size,
* don't do it to avoid risk of exploding pg_wal. Logical
* replication won't work until recreated, but at least compute
* would start; this also follows max_slot_wal_keep_size
* semantics.
*/
download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
{
wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
return InvalidXLogRecPtr;
}
/*
* start from the beginning of the segment to fetch page headers
* verifed by XLogReader
*/
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
}
}
return lrRestartLsn;
}
static const walproposer_api walprop_pg = {
@@ -1745,18 +2018,18 @@ static const walproposer_api walprop_pg = {
.conn_async_write = walprop_async_write,
.conn_blocking_write = walprop_blocking_write,
.recovery_download = WalProposerRecovery,
.wal_read = walprop_pg_wal_read,
.wal_reader_allocate = walprop_pg_wal_reader_allocate,
.free_event_set = walprop_pg_free_event_set,
.wal_read = walprop_pg_wal_read,
.wal_reader_events = walprop_pg_wal_reader_events,
.init_event_set = walprop_pg_init_event_set,
.update_event_set = walprop_pg_update_event_set,
.active_state_update_event_set = walprop_pg_active_state_update_event_set,
.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
.wait_event_set = walprop_pg_wait_event_set,
.strong_random = walprop_pg_strong_random,
.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
.log_internal = walprop_pg_log_internal,
.after_election = walprop_pg_after_election,
};

49
poetry.lock generated
View File

@@ -339,19 +339,19 @@ uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "boto3"
version = "1.26.16"
version = "1.34.11"
description = "The AWS SDK for Python"
optional = false
python-versions = ">= 3.7"
python-versions = ">= 3.8"
files = [
{file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
{file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
{file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
{file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
]
[package.dependencies]
botocore = ">=1.29.16,<1.30.0"
botocore = ">=1.34.11,<1.35.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.6.0,<0.7.0"
s3transfer = ">=0.10.0,<0.11.0"
[package.extras]
crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
@@ -702,22 +702,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]
[[package]]
name = "botocore"
version = "1.29.16"
version = "1.34.11"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">= 3.7"
python-versions = ">= 3.8"
files = [
{file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
{file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
{file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
{file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
]
[package.dependencies]
jmespath = ">=0.7.1,<2.0.0"
python-dateutil = ">=2.1,<3.0.0"
urllib3 = ">=1.25.4,<1.27"
urllib3 = [
{version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
{version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
]
[package.extras]
crt = ["awscrt (==0.14.0)"]
crt = ["awscrt (==0.19.19)"]
[[package]]
name = "botocore-stubs"
@@ -1889,13 +1892,13 @@ files = [
[[package]]
name = "pytest"
version = "7.3.1"
version = "7.4.4"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
{file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
{file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
{file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
]
[package.dependencies]
@@ -1907,7 +1910,7 @@ pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-asyncio"
@@ -2230,20 +2233,20 @@ files = [
[[package]]
name = "s3transfer"
version = "0.6.0"
version = "0.10.0"
description = "An Amazon S3 Transfer Manager"
optional = false
python-versions = ">= 3.7"
python-versions = ">= 3.8"
files = [
{file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
{file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
{file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
{file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
]
[package.dependencies]
botocore = ">=1.12.36,<2.0a.0"
botocore = ">=1.33.2,<2.0a.0"
[package.extras]
crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
[[package]]
name = "sarif-om"
@@ -2740,4 +2743,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada"

View File

@@ -11,6 +11,7 @@ use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
use anyhow::bail;
@@ -95,12 +96,8 @@ struct ProxyCliArgs {
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
/// timeout for http connections
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Whether the SQL over http pool is opt-in
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
sql_over_http_pool_opt_in: bool,
#[clap(flatten)]
sql_over_http: SqlOverHttpArgs,
/// timeout for scram authentication protocol
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
scram_protocol_timeout: tokio::time::Duration,
@@ -138,6 +135,36 @@ struct ProxyCliArgs {
disable_ip_check_for_http: bool,
}
#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
/// timeout for http connection requests
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Whether the SQL over http pool is opt-in
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
sql_over_http_pool_opt_in: bool,
/// How many connections to pool for each endpoint. Excess connections are discarded
#[clap(long, default_value_t = 20)]
sql_over_http_pool_max_conns_per_endpoint: usize,
/// How long pooled connections should remain idle for before closing
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
sql_over_http_idle_timeout: tokio::time::Duration,
/// Duration each shard will wait on average before a GC sweep.
/// A longer time will causes sweeps to take longer but will interfere less frequently.
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
sql_over_http_pool_gc_epoch: tokio::time::Duration,
/// How many shards should the global pool have. Must be a power of two.
/// More shards will introduce less contention for pool operations, but can
/// increase memory used by the pool
#[clap(long, default_value_t = 128)]
sql_over_http_pool_shards: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy::logging::init().await?;
@@ -327,8 +354,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
}
};
let http_config = HttpConfig {
timeout: args.sql_over_http_timeout,
pool_opt_in: args.sql_over_http_pool_opt_in,
request_timeout: args.sql_over_http.sql_over_http_timeout,
pool_options: GlobalConnPoolOptions {
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
pool_shards: args.sql_over_http.sql_over_http_pool_shards,
idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
},
};
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,

View File

@@ -1,4 +1,4 @@
use crate::{auth, rate_limiter::RateBucketInfo};
use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
use anyhow::{bail, ensure, Context, Ok};
use rustls::{sign, Certificate, PrivateKey};
use sha2::{Digest, Sha256};
@@ -36,8 +36,8 @@ pub struct TlsConfig {
}
pub struct HttpConfig {
pub timeout: tokio::time::Duration,
pub pool_opt_in: bool,
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions,
}
pub struct AuthenticationConfig {

View File

@@ -11,7 +11,7 @@ use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
use std::{net::SocketAddr, sync::Arc};
use std::sync::Arc;
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{error, info, info_span, warn, Instrument};
@@ -141,7 +141,7 @@ impl Api {
// We'll set username and such later using the startup message.
// TODO: add more type safety (in progress).
let mut config = compute::ConnCfg::new();
config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
let node = NodeInfo {
config,
@@ -269,9 +269,10 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
Err(ApiError::Console { status, text })
}
fn parse_host_port(input: &str) -> Option<(String, u16)> {
let parsed: SocketAddr = input.parse().ok()?;
Some((parsed.ip().to_string(), parsed.port()))
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.rsplit_once(':')?;
let ipv6_brackets: &[_] = &['[', ']'];
Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
}
#[cfg(test)]
@@ -279,9 +280,24 @@ mod tests {
use super::*;
#[test]
fn test_parse_host_port() {
fn test_parse_host_port_v4() {
let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
assert_eq!(host, "127.0.0.1");
assert_eq!(port, 5432);
}
#[test]
fn test_parse_host_port_v6() {
let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
assert_eq!(host, "2001:db8::1");
assert_eq!(port, 5432);
}
#[test]
fn test_parse_host_port_url() {
let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
.expect("failed to parse");
assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
assert_eq!(port, 5432);
}
}

View File

@@ -6,9 +6,13 @@ mod conn_pool;
mod sql_over_http;
mod websocket;
pub use conn_pool::GlobalConnPoolOptions;
use anyhow::bail;
use hyper::StatusCode;
use metrics::IntCounterPairGuard;
use rand::rngs::StdRng;
use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;
@@ -47,6 +51,11 @@ pub async fn task_main(
let conn_pool = conn_pool::GlobalConnPool::new(config);
let conn_pool2 = Arc::clone(&conn_pool);
tokio::spawn(async move {
conn_pool2.gc_worker(StdRng::from_entropy()).await;
});
// shutdown the connection pool
tokio::spawn({
let cancellation_token = cancellation_token.clone();

View File

@@ -1,15 +1,19 @@
use anyhow::{anyhow, Context};
use async_trait::async_trait;
use dashmap::DashMap;
use futures::future::poll_fn;
use futures::{future::poll_fn, Future};
use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard};
use once_cell::sync::Lazy;
use parking_lot::RwLock;
use pbkdf2::{
password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
Params, Pbkdf2,
};
use pq_proto::StartupMessageParams;
use prometheus::{exponential_buckets, register_histogram, Histogram};
use rand::Rng;
use smol_str::SmolStr;
use std::{collections::HashMap, net::IpAddr, sync::Arc};
use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
use std::{
fmt,
task::{ready, Poll},
@@ -18,7 +22,7 @@ use std::{
ops::Deref,
sync::atomic::{self, AtomicUsize},
};
use tokio::time;
use tokio::time::{self, Instant};
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
@@ -30,11 +34,10 @@ use crate::{
};
use crate::{compute, config};
use tracing::{error, warn, Span};
use tracing::{debug, error, warn, Span};
use tracing::{info, info_span, Instrument};
pub const APP_NAME: &str = "/sql_over_http";
const MAX_CONNS_PER_ENDPOINT: usize = 20;
#[derive(Debug, Clone)]
pub struct ConnInfo {
@@ -69,6 +72,77 @@ struct ConnPoolEntry {
pub struct EndpointConnPool {
pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
total_conns: usize,
max_conns: usize,
_guard: IntCounterPairGuard,
}
impl EndpointConnPool {
fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
let Self {
pools, total_conns, ..
} = self;
pools
.get_mut(&db_user)
.and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
}
fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
let Self {
pools, total_conns, ..
} = self;
if let Some(pool) = pools.get_mut(&db_user) {
let old_len = pool.conns.len();
pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
let new_len = pool.conns.len();
let removed = old_len - new_len;
*total_conns -= removed;
removed > 0
} else {
false
}
}
fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
let conn_id = client.conn_id;
if client.inner.is_closed() {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
return Ok(());
}
// return connection to the pool
let mut returned = false;
let mut per_db_size = 0;
let total_conns = {
let mut pool = pool.write();
if pool.total_conns < pool.max_conns {
// we create this db-user entry in get, so it should not be None
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
}
}
pool.total_conns
};
// do logging outside of the mutex
if returned {
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
Ok(())
}
}
/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
@@ -87,6 +161,27 @@ pub struct DbUserConnPool {
password_hash: Option<PasswordHashString>,
}
impl DbUserConnPool {
fn clear_closed_clients(&mut self, conns: &mut usize) {
let old_len = self.conns.len();
self.conns.retain(|conn| !conn.conn.inner.is_closed());
let new_len = self.conns.len();
let removed = old_len - new_len;
*conns -= removed;
}
fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry> {
self.clear_closed_clients(conns);
let conn = self.conns.pop();
if conn.is_some() {
*conns -= 1;
}
conn
}
}
pub struct GlobalConnPool {
// endpoint -> per-endpoint connection pool
//
@@ -94,52 +189,127 @@ pub struct GlobalConnPool {
// pool as early as possible and release the lock.
global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
/// Number of endpoint-connection pools
///
/// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
/// That seems like far too much effort, so we're using a relaxed increment counter instead.
/// It's only used for diagnostics.
global_pool_size: AtomicUsize,
proxy_config: &'static crate::config::ProxyConfig,
}
#[derive(Debug, Clone, Copy)]
pub struct GlobalConnPoolOptions {
// Maximum number of connections per one endpoint.
// Can mix different (dbname, username) connections.
// When running out of free slots for a particular endpoint,
// falls back to opening a new connection for each request.
max_conns_per_endpoint: usize,
pub max_conns_per_endpoint: usize,
proxy_config: &'static crate::config::ProxyConfig,
pub gc_epoch: Duration,
// Using a lock to remove any race conditions.
// Eg cleaning up connections while a new connection is returned
closed: RwLock<bool>,
pub pool_shards: usize,
pub idle_timeout: Duration,
pub opt_in: bool,
}
pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_http_pool_reclaimation_lag_seconds",
"Time it takes to reclaim unused connection pools",
// 1us -> 65ms
exponential_buckets(1e-6, 2.0, 16).unwrap(),
)
.unwrap()
});
pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
register_int_counter_pair!(
"proxy_http_pool_endpoints_registered_total",
"Number of endpoints we have registered pools for",
"proxy_http_pool_endpoints_unregistered_total",
"Number of endpoints we have unregistered pools for",
)
.unwrap()
});
impl GlobalConnPool {
pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
let shards = config.http_config.pool_options.pool_shards;
Arc::new(Self {
global_pool: DashMap::new(),
global_pool: DashMap::with_shard_amount(shards),
global_pool_size: AtomicUsize::new(0),
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
proxy_config: config,
closed: RwLock::new(false),
})
}
pub fn shutdown(&self) {
*self.closed.write() = true;
// drops all strong references to endpoint-pools
self.global_pool.clear();
}
self.global_pool.retain(|_, endpoint_pool| {
let mut pool = endpoint_pool.write();
// by clearing this hashmap, we remove the slots that a connection can be returned to.
// when returning, it drops the connection if the slot doesn't exist
pool.pools.clear();
pool.total_conns = 0;
pub async fn gc_worker(&self, mut rng: impl Rng) {
let epoch = self.proxy_config.http_config.pool_options.gc_epoch;
let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
loop {
interval.tick().await;
false
let shard = rng.gen_range(0..self.global_pool.shards().len());
self.gc(shard);
}
}
fn gc(&self, shard: usize) {
debug!(shard, "pool: performing epoch reclamation");
// acquire a random shard lock
let mut shard = self.global_pool.shards()[shard].write();
let timer = GC_LATENCY.start_timer();
let current_len = shard.len();
shard.retain(|endpoint, x| {
// if the current endpoint pool is unique (no other strong or weak references)
// then it is currently not in use by any connections.
if let Some(pool) = Arc::get_mut(x.get_mut()) {
let EndpointConnPool {
pools, total_conns, ..
} = pool.get_mut();
// ensure that closed clients are removed
pools
.iter_mut()
.for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns));
// we only remove this pool if it has no active connections
if *total_conns == 0 {
info!("pool: discarding pool for endpoint {endpoint}");
return false;
}
}
true
});
let new_len = shard.len();
drop(shard);
timer.observe_duration();
let removed = current_len - new_len;
if removed > 0 {
let global_pool_size = self
.global_pool_size
.fetch_sub(removed, atomic::Ordering::Relaxed)
- removed;
info!("pool: performed global pool gc. size now {global_pool_size}");
}
}
pub async fn get(
self: &Arc<Self>,
conn_info: &ConnInfo,
conn_info: ConnInfo,
force_new: bool,
session_id: uuid::Uuid,
peer_addr: IpAddr,
@@ -147,15 +317,11 @@ impl GlobalConnPool {
let mut client: Option<ClientInner> = None;
let mut latency_timer = LatencyTimer::new("http");
let pool = if force_new {
None
} else {
Some((conn_info.clone(), self.clone()))
};
let mut hash_valid = false;
let mut endpoint_pool = Weak::new();
if !force_new {
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
endpoint_pool = Arc::downgrade(&pool);
let mut hash = None;
// find a pool entry by (dbname, username) if exists
@@ -180,12 +346,8 @@ impl GlobalConnPool {
// we will continue with the regular connection flow
if validate.is_ok() {
hash_valid = true;
let mut pool = pool.write();
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
if let Some(entry) = pool_entries.conns.pop() {
client = Some(entry.conn);
pool.total_conns -= 1;
}
if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) {
client = Some(entry.conn)
}
}
}
@@ -198,11 +360,12 @@ impl GlobalConnPool {
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(
self.proxy_config,
conn_info,
&conn_info,
conn_id,
session_id,
latency_timer,
peer_addr,
endpoint_pool.clone(),
)
.await
} else {
@@ -214,18 +377,19 @@ impl GlobalConnPool {
);
latency_timer.pool_hit();
latency_timer.success();
return Ok(Client::new(client, pool).await);
return Ok(Client::new(client, conn_info, endpoint_pool).await);
}
} else {
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
connect_to_compute(
self.proxy_config,
conn_info,
&conn_info,
conn_id,
session_id,
latency_timer,
peer_addr,
endpoint_pool.clone(),
)
.await
};
@@ -269,59 +433,7 @@ impl GlobalConnPool {
_ => {}
}
let new_client = new_client?;
Ok(Client::new(new_client, pool).await)
}
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
let conn_id = client.conn_id;
// We want to hold this open while we return. This ensures that the pool can't close
// while we are in the middle of returning the connection.
let closed = self.closed.read();
if *closed {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
return Ok(());
}
if client.inner.is_closed() {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
return Ok(());
}
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
// return connection to the pool
let mut returned = false;
let mut per_db_size = 0;
let total_conns = {
let mut pool = pool.write();
if pool.total_conns < self.max_conns_per_endpoint {
// we create this db-user entry in get, so it should not be None
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
}
}
pool.total_conns
};
// do logging outside of the mutex
if returned {
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
Ok(())
Ok(Client::new(new_client, conn_info, endpoint_pool).await)
}
fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
@@ -334,6 +446,12 @@ impl GlobalConnPool {
let new_pool = Arc::new(RwLock::new(EndpointConnPool {
pools: HashMap::new(),
total_conns: 0,
max_conns: self
.proxy_config
.http_config
.pool_options
.max_conns_per_endpoint,
_guard: ENDPOINT_POOLS.guard(),
}));
// find or create a pool for this endpoint
@@ -363,9 +481,11 @@ impl GlobalConnPool {
}
struct TokioMechanism<'a> {
pool: Weak<RwLock<EndpointConnPool>>,
conn_info: &'a ConnInfo,
session_id: uuid::Uuid,
conn_id: uuid::Uuid,
idle: Duration,
}
#[async_trait]
@@ -385,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
timeout,
self.conn_id,
self.session_id,
self.pool.clone(),
self.idle,
)
.await
}
@@ -403,6 +525,7 @@ async fn connect_to_compute(
session_id: uuid::Uuid,
latency_timer: LatencyTimer,
peer_addr: IpAddr,
pool: Weak<RwLock<EndpointConnPool>>,
) -> anyhow::Result<ClientInner> {
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -447,6 +570,8 @@ async fn connect_to_compute(
conn_id,
conn_info,
session_id,
pool,
idle: config.http_config.pool_options.idle_timeout,
},
node_info,
&extra,
@@ -462,6 +587,8 @@ async fn connect_to_compute_once(
timeout: time::Duration,
conn_id: uuid::Uuid,
mut session: uuid::Uuid,
pool: Weak<RwLock<EndpointConnPool>>,
idle: Duration,
) -> Result<ClientInner, tokio_postgres::Error> {
let mut config = (*node_info.config).clone();
@@ -490,13 +617,29 @@ async fn connect_to_compute_once(
branch_id: node_info.aux.branch_id.clone(),
};
let db_user = conn_info.db_and_user();
tokio::spawn(
async move {
let _conn_gauge = conn_gauge;
let mut idle_timeout = pin!(tokio::time::sleep(idle));
poll_fn(move |cx| {
if matches!(rx.has_changed(), Ok(true)) {
session = *rx.borrow_and_update();
info!(%session, "changed session");
idle_timeout.as_mut().reset(Instant::now() + idle);
}
// 5 minute idle connection timeout
if idle_timeout.as_mut().poll(cx).is_ready() {
idle_timeout.as_mut().reset(Instant::now() + idle);
info!("connection idle");
if let Some(pool) = pool.clone().upgrade() {
// remove client from pool - should close the connection if it's idle.
// does nothing if the client is currently checked-out and in-use
if pool.write().remove_client(db_user.clone(), conn_id) {
info!("idle connection removed");
}
}
}
loop {
@@ -514,15 +657,25 @@ async fn connect_to_compute_once(
}
Some(Err(e)) => {
error!(%session, "connection error: {}", e);
return Poll::Ready(())
break
}
None => {
info!("connection closed");
return Poll::Ready(())
break
}
}
}
}).await
// remove from connection pool
if let Some(pool) = pool.clone().upgrade() {
if pool.write().remove_client(db_user.clone(), conn_id) {
info!("closed connection removed");
}
}
Poll::Ready(())
}).await;
}
.instrument(span)
);
@@ -552,23 +705,27 @@ pub struct Client {
conn_id: uuid::Uuid,
span: Span,
inner: Option<ClientInner>,
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool>>,
}
pub struct Discard<'a> {
conn_id: uuid::Uuid,
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
conn_info: &'a ConnInfo,
pool: &'a mut Weak<RwLock<EndpointConnPool>>,
}
impl Client {
pub(self) async fn new(
inner: ClientInner,
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool>>,
) -> Self {
Self {
conn_id: inner.conn_id,
inner: Some(inner),
span: Span::current(),
conn_info,
pool,
}
}
@@ -577,6 +734,7 @@ impl Client {
inner,
pool,
conn_id,
conn_info,
span: _,
} = self;
(
@@ -586,6 +744,7 @@ impl Client {
.inner,
Discard {
pool,
conn_info,
conn_id: *conn_id,
},
)
@@ -601,14 +760,14 @@ impl Client {
impl Discard<'_> {
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
if status != ReadyForQueryStatus::Idle {
if let Some((conn_info, _)) = self.pool.take() {
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
}
let conn_info = &self.conn_info;
if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
}
}
pub fn discard(&mut self) {
if let Some((conn_info, _)) = self.pool.take() {
let conn_info = &self.conn_info;
if std::mem::take(self.pool).strong_count() > 0 {
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
}
}
@@ -628,16 +787,17 @@ impl Deref for Client {
impl Drop for Client {
fn drop(&mut self) {
let conn_info = self.conn_info.clone();
let client = self
.inner
.take()
.expect("client inner should not be removed");
if let Some((conn_info, conn_pool)) = self.pool.take() {
if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
let current_span = self.span.clone();
// return connection to the pool
tokio::task::spawn_blocking(move || {
let _span = current_span.enter();
let _ = conn_pool.put(&conn_info, client);
let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
});
}
}

View File

@@ -206,7 +206,7 @@ pub async fn handle(
config: &'static HttpConfig,
) -> Result<Response<Body>, ApiError> {
let result = tokio::time::timeout(
config.timeout,
config.request_timeout,
handle_inner(
config,
request,
@@ -278,7 +278,7 @@ pub async fn handle(
Err(_) => {
let message = format!(
"HTTP-Connection timed out, execution time exeeded {} seconds",
config.timeout.as_secs()
config.request_timeout.as_secs()
);
error!(message);
json_response(
@@ -320,7 +320,8 @@ async fn handle_inner(
// Allow connection pooling only if explicitly requested
// or if we have decided that http pool is no longer opt-in
let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
let allow_pool =
!config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
// isolation level, read only and deferrable
@@ -359,7 +360,7 @@ async fn handle_inner(
let payload: Payload = serde_json::from_slice(&body)?;
let mut client = conn_pool
.get(&conn_info, !allow_pool, session_id, peer_addr)
.get(conn_info, !allow_pool, session_id, peer_addr)
.await?;
let mut response = Response::builder()

View File

@@ -6,7 +6,7 @@ authors = []
[tool.poetry.dependencies]
python = "^3.9"
pytest = "^7.3.1"
pytest = "^7.4.4"
psycopg2-binary = "^2.9.6"
typing-extensions = "^4.6.1"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
@@ -17,7 +17,7 @@ aiopg = "^1.4.0"
Jinja2 = "^3.0.2"
types-requests = "^2.31.0.0"
types-psycopg2 = "^2.9.21.10"
boto3 = "^1.26.16"
boto3 = "^1.34.11"
boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
moto = {extras = ["server"], version = "^4.1.2"}
backoff = "^2.2.1"

View File

@@ -6,6 +6,7 @@ license.workspace = true
[dependencies]
aws-sdk-s3.workspace = true
aws-smithy-async.workspace = true
either.workspace = true
tokio-rustls.workspace = true
anyhow.workspace = true
@@ -39,3 +40,5 @@ tracing-subscriber.workspace = true
clap.workspace = true
tracing-appender = "0.2"
histogram = "0.7"
futures.workspace = true

View File

@@ -1,9 +1,12 @@
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use anyhow::Context;
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver_api::shard::ShardIndex;
use tracing::{error, info, warn};
use utils::generation::Generation;
use utils::id::TimelineId;
use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
@@ -40,7 +43,7 @@ impl TimelineAnalysis {
pub(crate) fn branch_cleanup_and_check_errors(
id: &TenantShardTimelineId,
s3_root: &RootTarget,
tenant_objects: &mut TenantObjectListing,
s3_active_branch: Option<&BranchData>,
console_branch: Option<BranchData>,
s3_data: Option<S3TimelineBlobData>,
@@ -72,8 +75,8 @@ pub(crate) fn branch_cleanup_and_check_errors(
match s3_data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation,
mut s3_layers,
index_part_generation: _index_part_generation,
s3_layers: _s3_layers,
} => {
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
result.errors.push(format!(
@@ -111,65 +114,19 @@ pub(crate) fn branch_cleanup_and_check_errors(
))
}
let layer_map_key = (layer, metadata.generation);
if !s3_layers.remove(&layer_map_key) {
if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
// FIXME: this will emit false positives if an index was
// uploaded concurrently with our scan. To make this check
// correct, we need to try sending a HEAD request for the
// layer we think is missing.
result.errors.push(format!(
"index_part.json contains a layer {}{} that is not present in remote storage",
layer_map_key.0.file_name(),
layer_map_key.1.get_suffix()
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
layer.file_name(),
metadata.generation.get_suffix(),
metadata.shard
))
}
}
let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
.into_iter()
.filter(|(_layer_name, gen)|
// A layer is only considered orphaned if it has a generation below
// the index. If the generation is >= the index, then the layer may
// be an upload from a running pageserver, or even an upload from
// a new generation that didn't upload an index yet.
//
// Even so, a layer that is not referenced by the index could just
// be something enqueued for deletion, so while this check is valid
// for indicating that a layer is garbage, it is not an indicator
// of a problem.
gen < &index_part_generation)
.collect();
if !orphan_layers.is_empty() {
// An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
// these as a hint that there is something worth cleaning up here.
result.warnings.push(format!(
"index_part.json does not contain layers from S3: {:?}",
orphan_layers
.iter()
.map(|(layer_name, gen)| format!(
"{}{}",
layer_name.file_name(),
gen.get_suffix()
))
.collect::<Vec<_>>(),
));
result.garbage_keys.extend(orphan_layers.iter().map(
|(layer_name, layer_gen)| {
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
let delimiter = s3_root.delimiter();
if !key.ends_with(delimiter) {
key.push_str(delimiter);
}
key.push_str(&format!(
"{}{}",
&layer_name.file_name(),
layer_gen.get_suffix()
));
key
},
));
}
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
@@ -204,6 +161,83 @@ pub(crate) fn branch_cleanup_and_check_errors(
result
}
#[derive(Default)]
pub(crate) struct LayerRef {
ref_count: usize,
}
/// Top-level index of objects in a tenant. This may be used by any shard-timeline within
/// the tenant to query whether an object exists.
#[derive(Default)]
pub(crate) struct TenantObjectListing {
shard_timelines:
HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>,
}
impl TenantObjectListing {
/// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall
/// list of layer keys for the Tenant.
pub(crate) fn push(
&mut self,
ttid: TenantShardTimelineId,
layers: HashSet<(LayerFileName, Generation)>,
) {
let shard_index = ShardIndex::new(
ttid.tenant_shard_id.shard_number,
ttid.tenant_shard_id.shard_count,
);
let replaced = self.shard_timelines.insert(
(shard_index, ttid.timeline_id),
layers
.into_iter()
.map(|l| (l, LayerRef::default()))
.collect(),
);
assert!(
replaced.is_none(),
"Built from an S3 object listing, which should never repeat a key"
);
}
/// Having loaded a timeline index, check if a layer referenced by the index exists. If it does,
/// the layer's refcount will be incremented. Later, after calling this for all references in all indices
/// in a tenant, orphan layers may be detected by their zero refcounts.
///
/// Returns true if the layer exists
pub(crate) fn check_ref(
&mut self,
timeline_id: TimelineId,
layer_file: &LayerFileName,
metadata: &IndexLayerMetadata,
) -> bool {
let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
return false;
};
let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else {
return false;
};
layer_ref.ref_count += 1;
true
}
pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> {
let mut result = Vec::new();
for ((shard_index, timeline_id), layers) in &self.shard_timelines {
for ((layer_file, generation), layer_ref) in layers {
if layer_ref.ref_count == 0 {
result.push((*shard_index, *timeline_id, layer_file.clone(), *generation))
}
}
}
result
}
}
#[derive(Debug)]
pub(crate) struct S3TimelineBlobData {
pub(crate) blob_data: BlobDataParseResult,

View File

@@ -15,10 +15,13 @@ use anyhow::Context;
use aws_config::environment::EnvironmentVariableCredentialsProvider;
use aws_config::imds::credentials::ImdsCredentialsProvider;
use aws_config::meta::credentials::CredentialsProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::retry::RetryConfig;
use aws_config::sso::SsoCredentialsProvider;
use aws_config::BehaviorVersion;
use aws_sdk_s3::config::Region;
use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
use aws_sdk_s3::{Client, Config};
use aws_smithy_async::rt::sleep::TokioSleep;
use clap::ValueEnum;
use pageserver::tenant::TENANTS_SEGMENT_NAME;
@@ -255,6 +258,11 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
let chain = CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder().build(),
);
// Use SSO if we were given an account ID
@@ -265,7 +273,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
.account_id(sso_account)
.role_name("PowerUserAccess")
.start_url("https://neondb.awsapps.com/start")
.region(Region::from_static("eu-central-1"))
.region(bucket_region.clone())
.build(),
),
None => chain,
@@ -277,9 +285,13 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
)
};
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
let mut builder = Config::builder()
.behavior_version(BehaviorVersion::v2023_11_09())
.region(bucket_region)
.retry_config(RetryConfig::adaptive().with_max_attempts(3))
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.credentials_provider(credentials_provider);
if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {

View File

@@ -1,3 +1,4 @@
use pageserver_api::shard::TenantShardId;
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use s3_scrubber::scan_metadata::scan_metadata;
use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
@@ -34,6 +35,8 @@ enum Command {
ScanMetadata {
#[arg(short, long, default_value_t = false)]
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
},
}
@@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
Command::ScanMetadata { json, tenant_ids } => {
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}
}
}
},
}
Command::FindGarbage {
node_kind,
depth,

View File

@@ -2,22 +2,25 @@ use std::collections::{HashMap, HashSet};
use crate::checks::{
branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
TimelineAnalysis,
TenantObjectListing, TimelineAnalysis,
};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use aws_sdk_s3::Client;
use futures_util::{pin_mut, StreamExt, TryStreamExt};
use histogram::Histogram;
use pageserver::tenant::remote_timeline_client::remote_layer_path;
use pageserver::tenant::IndexPart;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use utils::id::TenantId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
with_garbage: HashSet<TenantShardTimelineId>,
with_orphans: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
layer_count: MinMaxHisto,
@@ -87,7 +90,7 @@ impl MetadataSummary {
count: 0,
with_errors: HashSet::new(),
with_warnings: HashSet::new(),
with_garbage: HashSet::new(),
with_orphans: HashSet::new(),
indices_by_version: HashMap::new(),
layer_count: MinMaxHisto::new(),
timeline_size_bytes: MinMaxHisto::new(),
@@ -141,6 +144,10 @@ impl MetadataSummary {
}
}
fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) {
self.with_orphans.insert(*ttid);
}
/// Long-form output for printing at end of a scan
pub fn summary_string(&self) -> String {
let version_summary: String = itertools::join(
@@ -154,7 +161,7 @@ impl MetadataSummary {
"Timelines: {0}
With errors: {1}
With warnings: {2}
With garbage: {3}
With orphan layers: {3}
Index versions: {version_summary}
Timeline size bytes: {4}
Layer size bytes: {5}
@@ -163,7 +170,7 @@ Timeline layer count: {6}
self.count,
self.with_errors.len(),
self.with_warnings.len(),
self.with_garbage.len(),
self.with_orphans.len(),
self.timeline_size_bytes.oneline(),
self.layer_size_bytes.oneline(),
self.layer_count.oneline(),
@@ -180,10 +187,17 @@ Timeline layer count: {6}
}
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
pub async fn scan_metadata(
bucket_config: BucketConfig,
tenant_ids: Vec<TenantShardId>,
) -> anyhow::Result<MetadataSummary> {
let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;
let tenants = stream_tenants(&s3_client, &target);
let tenants = if tenant_ids.is_empty() {
futures::future::Either::Left(stream_tenants(&s3_client, &target))
} else {
futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
};
// How many tenants to process in parallel. We need to be mindful of pageservers
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
@@ -191,7 +205,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
// Generate a stream of TenantTimelineId
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
let timelines = timelines.try_buffered(CONCURRENCY);
let timelines = timelines.try_flatten();
// Generate a stream of S3TimelineBlobData
@@ -204,17 +218,118 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
Ok((ttid, data))
}
let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
let timelines = timelines.try_buffered(CONCURRENCY);
// We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
// shards in the same tenant might refer to one anothers' keys if a shard split has happened.
let mut tenant_id = None;
let mut tenant_objects = TenantObjectListing::default();
let mut tenant_timeline_results = Vec::new();
fn analyze_tenant(
tenant_id: TenantId,
summary: &mut MetadataSummary,
mut tenant_objects: TenantObjectListing,
timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
) {
let mut timeline_generations = HashMap::new();
for (ttid, data) in timelines {
// Stash the generation of each timeline, for later use identifying orphan layers
if let BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation,
s3_layers: _s3_layers,
} = &data.blob_data
{
timeline_generations.insert(ttid, *index_part_generation);
}
// Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
// reference counts for layers across the tenant.
let analysis =
branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data));
summary.update_analysis(&ttid, &analysis);
}
// Identifying orphan layers must be done on a tenant-wide basis, because individual
// shards' layers may be referenced by other shards.
//
// Orphan layers are not a corruption, and not an indication of a problem. They are just
// consuming some space in remote storage, and may be cleaned up at leisure.
for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() {
let ttid = TenantShardTimelineId {
tenant_shard_id: TenantShardId {
tenant_id,
shard_count: shard_index.shard_count,
shard_number: shard_index.shard_number,
},
timeline_id,
};
if let Some(timeline_generation) = timeline_generations.get(&ttid) {
if &generation >= timeline_generation {
// Candidate orphan layer is in the current or future generation relative
// to the index we read for this timeline shard, so its absence from the index
// doesn't make it an orphan: more likely, it is a case where the layer was
// uploaded, but the index referencing the layer wasn't written yet.
continue;
}
}
let orphan_path = remote_layer_path(
&tenant_id,
&timeline_id,
shard_index,
&layer_file,
generation,
);
tracing::info!("Orphan layer detected: {orphan_path}");
summary.notify_timeline_orphan(&ttid);
}
}
// Iterate through all the timeline results. These are in key-order, so
// all results for the same tenant will be adjacent. We accumulate these,
// and then call `analyze_tenant` to flush, when we see the next tenant ID.
let mut summary = MetadataSummary::new();
pin_mut!(timelines);
while let Some(i) = timelines.next().await {
let (ttid, data) = i?;
summary.update_data(&data);
let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
match tenant_id {
None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
Some(prev_tenant_id) => {
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
let tenant_objects = std::mem::take(&mut tenant_objects);
let timelines = std::mem::take(&mut tenant_timeline_results);
analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
}
}
}
summary.update_analysis(&ttid, &analysis);
if let BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation: _index_part_generation,
s3_layers,
} = &data.blob_data
{
tenant_objects.push(ttid, s3_layers.clone());
}
tenant_timeline_results.push((ttid, data));
}
if !tenant_timeline_results.is_empty() {
analyze_tenant(
tenant_id.expect("Must be set if results are present"),
&mut summary,
tenant_objects,
tenant_timeline_results,
);
}
Ok(summary)

View File

@@ -4,6 +4,12 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
[dependencies]
async-stream.workspace = true
anyhow.workspace = true
@@ -16,6 +22,7 @@ chrono.workspace = true
clap = { workspace = true, features = ["derive"] }
const_format.workspace = true
crc32c.workspace = true
fail.workspace = true
fs2.workspace = true
git-version.workspace = true
hex.workspace = true

View File

@@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id";
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
];
fn version() -> String {
format!(
"{GIT_VERSION} failpoints: {}, features: {:?}",
fail::has_failpoints(),
FEATURES,
)
}
const ABOUT: &str = r#"
A fleet of safekeepers is responsible for reliably storing WAL received from
compute, passing it through consensus (mitigating potential computes brain
@@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> {
// getting 'argument cannot be used multiple times' error. This seems to be
// impossible with pure Derive API, so convert struct to Command, modify it,
// parse arguments, and then fill the struct back.
let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
let cmd = <Args as clap::CommandFactory>::command()
.args_override_self(true)
.version(version());
let mut matches = cmd.get_matches();
let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;

View File

@@ -12,6 +12,8 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use tokio_util::sync::CancellationToken;
use utils::failpoint_support::failpoints_handler;
use std::io::Write as _;
use tokio::sync::mpsc;
@@ -444,6 +446,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
.data(Arc::new(conf))
.data(auth)
.get("/v1/status", |r| request_span(r, status_handler))
.put("/v1/failpoints", |r| {
request_span(r, move |r| async {
let cancel = CancellationToken::new();
failpoints_handler(r, cancel).await
})
})
// Will be used in the future instead of implicit timeline creation
.post("/v1/tenant/timeline", |r| {
request_span(r, timeline_create_handler)

View File

@@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
use utils::failpoint_support;
use utils::id::TenantTimelineId;
use utils::lsn::AtomicLsn;
use utils::pageserver_feedback::PageserverFeedback;
@@ -391,15 +392,8 @@ impl SafekeeperPostgresHandler {
// application_name: give only committed WAL (used by pageserver) or all
// existing WAL (up to flush_lsn, used by walproposer or peer recovery).
// The second case is always driven by a consensus leader which term
// must generally be also supplied. However we're sloppy to do this in
// walproposer recovery which will be removed soon. So TODO is to make
// it not Option'al then.
//
// Fetching WAL without term in recovery creates a small risk of this
// WAL getting concurrently garbaged if another compute rises which
// collects majority and starts fixing log on this safekeeper itself.
// That's ok as (old) proposer will never be able to commit such WAL.
let end_watch = if self.is_walproposer_recovery() {
// must be supplied.
let end_watch = if term.is_some() {
EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
} else {
EndWatch::Commit(tli.get_commit_lsn_watch_rx())
@@ -535,12 +529,19 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
);
// try to send as much as available, capped by MAX_SEND_SIZE
let mut send_size = self
.end_pos
.checked_sub(self.start_pos)
.context("reading wal without waiting for it first")?
.0 as usize;
send_size = min(send_size, self.send_buf.len());
let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
// if we went behind available WAL, back off
if chunk_end_pos >= self.end_pos {
chunk_end_pos = self.end_pos;
} else {
// If sending not up to end pos, round down to page boundary to
// avoid breaking WAL record not at page boundary, as protocol
// demands. See walsender.c (XLogSendPhysical).
chunk_end_pos = chunk_end_pos
.checked_sub(chunk_end_pos.block_offset())
.unwrap();
}
let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
let send_buf = &mut self.send_buf[..send_size];
let send_size: usize;
{
@@ -551,7 +552,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
} else {
None
};
// read wal into buffer
// Read WAL into buffer. send_size can be additionally capped to
// segment boundary here.
send_size = self.wal_reader.read(send_buf).await?
};
let send_buf = &send_buf[..send_size];
@@ -566,6 +568,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
}))
.await?;
if let Some(appname) = &self.appname {
if appname == "replica" {
failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
}
}
trace!(
"sent {} bytes of WAL {}-{}",
send_size,

View File

@@ -565,6 +565,9 @@ impl WalReader {
})
}
/// Read WAL at current position into provided buf, returns number of bytes
/// read. It can be smaller than buf size only if segment boundary is
/// reached.
pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
// If this timeline is new, we may not have a full segment yet, so
// we pad the first bytes of the timeline's first WAL segment with 0s

View File

@@ -1,2 +1,4 @@
result
*.json
hosts
poetry.lock

View File

@@ -0,0 +1,11 @@
[defaults]
host_key_checking = False
inventory=./hosts
remote_tmp=/tmp
remote_user=developer
callbacks_enabled = profile_tasks
[ssh_connection]
scp_if_ssh = True
ssh_args = -F ./ssh.cfg
pipelining = True

View File

@@ -0,0 +1,16 @@
[tool.poetry]
name = "sk-collect-dumps"
version = "0.1.0"
description = ""
authors = ["Arseny Sher <sher-ars@yandex.ru>"]
readme = "README.md"
packages = [{include = "sk_collect_dumps"}]
[tool.poetry.dependencies]
python = "^3.11"
ansible = "^9.1.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

View File

@@ -1,25 +1,43 @@
# Collect /v1/debug_dump from all safekeeper nodes
1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
## How to use ansible (staging)
3. Issue admin token (add/remove .stage from url for staging/prod and setting proper API key):
```
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
# staging:
AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# prod:
AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# check
echo $AUTH_TOKEN
```
2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
There are two ways to do that, with ssm or tsh. ssm:
```
# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit):
AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml
```
It will put the results to .results directory *near the playbook*.
tsh:
Update the inventory, if needed, selecting .build/.tech and optionally region:
```
rm -f hosts && echo '[safekeeper]' >> hosts
# staging:
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts
# prod:
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts
```
## How to use ansible (prod)
Test ansible connection:
```
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
ansible all -m ping -v
```
Download the dumps:
```
mkdir -p result && rm -f result/*
ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml
```
3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.

View File

@@ -1,18 +1,37 @@
- name: Fetch state dumps from safekeepers
hosts: safekeepers
hosts: safekeeper
gather_facts: False
remote_user: "{{ remote_user }}"
tasks:
- name: Download file
- name: Dump file
get_url:
url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
dest: "/tmp/{{ inventory_hostname }}.json"
dest: "/tmp/{{ inventory_hostname }}-dump.json"
headers:
Authorization: "Bearer {{ auth_token }}"
- name: Fetch file from remote hosts
- name: install rsync
ansible.builtin.apt:
name: rsync
update_cache: yes
become: yes
ignore_errors: true # it can be already installed and we don't always have sudo
- name: Fetch file from remote hosts (works only with ssm)
fetch:
src: "/tmp/{{ inventory_hostname }}.json"
dest: "./result/{{ inventory_hostname }}.json"
src: "/tmp/{{ inventory_hostname }}-dump.json"
dest: "./result/{{ inventory_hostname }}-dump.json"
flat: yes
fail_on_missing: no
when: ansible_connection == "aws_ssm"
# xxx not sure how to make ansible 'synchronize' work with tsh
- name: Fetch file from remote hosts
shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json"
delegate_to: localhost
when: ansible_connection != "aws_ssm"
- name: remove remote dumps
ansible.builtin.file:
path: "/tmp/{{ inventory_hostname }}-dump.json"
state: absent

View File

@@ -0,0 +1,13 @@
# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh
# Common flags for all teleport.aws.neon.tech hosts
Host *
HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com
# Flags for all teleport.aws.neon.tech hosts except the proxy
Host * !teleport.aws.neon.tech
Port 3022
ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p
User developer
# End generated Teleport configuration

View File

@@ -31,22 +31,22 @@ SELECT
(data->>'tenant_id') AS tenant_id,
(data->>'timeline_id') AS timeline_id,
(data->'memory'->>'active')::bool AS active,
(data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
(data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
(data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
(data->'memory'->>'write_lsn')::bigint AS write_lsn,
(data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn,
(data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn,
(data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn,
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn,
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn,
(data->'memory'->>'write_lsn')::pg_lsn AS write_lsn,
(data->'memory'->>'num_computes')::bigint AS num_computes,
(data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
(data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn,
(data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
(data->'memory'->>'is_cancelled')::bool AS is_cancelled,
(data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
(data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
(data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn,
(data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn,
(data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
(data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
(data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
(data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
(data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
(data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn,
(data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn,
(data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn,
(data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn
FROM tmp_json
EOF

View File

@@ -347,7 +347,9 @@ class PgProtocol:
"""
return self.safe_psql_many([query], **kwargs)[0]
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
def safe_psql_many(
self, queries: List[str], log_query=True, **kwargs: Any
) -> List[List[Tuple[Any, ...]]]:
"""
Execute queries against the node and return all rows.
This method passes all extra params to connstr.
@@ -356,7 +358,8 @@ class PgProtocol:
with closing(self.connect(**kwargs)) as conn:
with conn.cursor() as cur:
for query in queries:
log.info(f"Executing query: {query}")
if log_query:
log.info(f"Executing query: {query}")
cur.execute(query)
if cur.description is None:
@@ -365,6 +368,12 @@ class PgProtocol:
result.append(cur.fetchall())
return result
def safe_psql_scalar(self, query, log_query=True) -> Any:
"""
Execute query returning single row with single column.
"""
return self.safe_psql(query, log_query=log_query)[0][0]
@dataclass
class AuthKeys:
@@ -457,7 +466,6 @@ class NeonEnvBuilder:
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
self.enable_generations = True
self.scrub_on_exit = False
self.test_output_dir = test_output_dir
@@ -677,8 +685,7 @@ class NeonEnvBuilder:
pageserver.stop(immediate=True)
if self.env.attachment_service is not None:
self.env.attachment_service.stop(immediate=True)
self.env.attachment_service.stop(immediate=True)
cleanup_error = None
@@ -772,10 +779,9 @@ class NeonEnv:
self.initial_tenant = config.initial_tenant
self.initial_timeline = config.initial_timeline
self.control_plane_api: Optional[str] = None
self.attachment_service: Optional[NeonAttachmentService] = None
if config.enable_generations:
self.enable_generations()
attachment_service_port = self.port_distributor.get_port()
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service: NeonAttachmentService = NeonAttachmentService(self)
# Create a config file corresponding to the options
cfg: Dict[str, Any] = {
@@ -844,24 +850,11 @@ class NeonEnv:
log.info(f"Config: {cfg}")
self.neon_cli.init(cfg)
def enable_generations(self, start=False):
if not start:
# TODO: assert that we haven't `self.start()`ed yet
pass
assert self.control_plane_api is None
assert self.attachment_service is None
attachment_service_port = self.port_distributor.get_port()
self.control_plane_api = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service = NeonAttachmentService(self)
if start:
self.attachment_service.start()
def start(self):
# Start up broker, pageserver and all safekeepers
self.broker.try_start()
if self.attachment_service is not None:
self.attachment_service.start()
self.attachment_service.start()
for pageserver in self.pageservers:
pageserver.start()
@@ -900,8 +893,8 @@ class NeonEnv:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
def get_pageserver_version(self) -> str:
bin_pageserver = str(self.neon_binpath / "pageserver")
def get_binary_version(self, binary_name: str) -> str:
bin_pageserver = str(self.neon_binpath / binary_name)
res = subprocess.run(
[bin_pageserver, "--version"],
check=True,
@@ -1589,6 +1582,7 @@ class Pagectl(AbstractNeonCli):
parsed = json.loads(res.stdout)
return IndexPartDump.from_json(parsed)
# class GetpageBenchLibpq(AbstractNeonCli):
# """
# A typed wrapper around the `getpage_bench_libpq` CLI.
@@ -1676,7 +1670,7 @@ class NeonPageserver(PgProtocol):
self.running = False
self.service_port = port
self.config_override = config_override
self.version = env.get_pageserver_version()
self.version = env.get_binary_version("pageserver")
# After a test finishes, we will scrape the log to see if there are any
# unexpected error messages. If your test expects an error, add it to
@@ -1853,20 +1847,19 @@ class NeonPageserver(PgProtocol):
"""
client = self.http_client()
return client.tenant_attach(
tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
tenant_id,
config,
config_null,
generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id),
)
def tenant_detach(self, tenant_id: TenantId):
if self.env.attachment_service is not None:
self.env.attachment_service.attach_hook_drop(tenant_id)
self.env.attachment_service.attach_hook_drop(tenant_id)
client = self.http_client()
return client.tenant_detach(tenant_id)
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
# This API is only for use when generations are enabled
assert self.env.attachment_service is not None
if config["mode"].startswith("Attached") and "generation" not in config:
config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
@@ -1892,26 +1885,15 @@ class NeonPageserver(PgProtocol):
generation: Optional[int] = None,
) -> TenantId:
if generation is None:
generation = self.maybe_get_generation(tenant_id)
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
client = self.http_client(auth_token=auth_token)
return client.tenant_create(tenant_id, conf, generation=generation)
def tenant_load(self, tenant_id: TenantId):
client = self.http_client()
return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
def maybe_get_generation(self, tenant_id: TenantId):
"""
For tests that would like to use an HTTP client directly instead of using
the `tenant_attach` and `tenant_create` helpers here: issue a generation
number for a tenant.
Returns None if the attachment service is not enabled (legacy mode)
"""
if self.env.attachment_service is not None:
return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
else:
return None
return client.tenant_load(
tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
)
def append_pageserver_param_overrides(
@@ -2771,6 +2753,13 @@ class Endpoint(PgProtocol):
):
self.stop()
# Checkpoints running endpoint and returns pg_wal size in MB.
def get_pg_wal_size(self):
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
self.safe_psql("checkpoint")
assert self.pgdata_dir is not None # please mypy
return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
class EndpointFactory:
"""An object representing multiple compute endpoints."""
@@ -2949,7 +2938,10 @@ class Safekeeper:
return res
def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient:
return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token)
is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper")
return SafekeeperHttpClient(
port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
)
def data_dir(self) -> str:
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
@@ -2969,6 +2961,13 @@ class Safekeeper:
return segments
# Walreceiver as returned by sk's timeline status endpoint.
@dataclass
class Walreceiver:
conn_id: int
state: str
@dataclass
class SafekeeperTimelineStatus:
acceptor_epoch: int
@@ -2979,6 +2978,7 @@ class SafekeeperTimelineStatus:
backup_lsn: Lsn
peer_horizon_lsn: Lsn
remote_consistent_lsn: Lsn
walreceivers: List[Walreceiver]
@dataclass
@@ -2992,10 +2992,11 @@ class SafekeeperMetrics:
class SafekeeperHttpClient(requests.Session):
HTTPError = requests.HTTPError
def __init__(self, port: int, auth_token: Optional[str] = None):
def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
super().__init__()
self.port = port
self.auth_token = auth_token
self.is_testing_enabled = is_testing_enabled
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"
@@ -3003,6 +3004,30 @@ class SafekeeperHttpClient(requests.Session):
def check_status(self):
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
def is_testing_enabled_or_skip(self):
if not self.is_testing_enabled:
pytest.skip("safekeeper was built without 'testing' feature")
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
self.is_testing_enabled_or_skip()
if isinstance(config_strings, tuple):
pairs = [config_strings]
else:
pairs = config_strings
log.info(f"Requesting config failpoints: {repr(pairs)}")
res = self.put(
f"http://localhost:{self.port}/v1/failpoints",
json=[{"name": name, "actions": actions} for name, actions in pairs],
)
log.info(f"Got failpoints request response code {res.status_code}")
res.raise_for_status()
res_json = res.json()
assert res_json is None
return res_json
def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
params = params or {}
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
@@ -3040,6 +3065,7 @@ class SafekeeperHttpClient(requests.Session):
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
res.raise_for_status()
resj = res.json()
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
return SafekeeperTimelineStatus(
acceptor_epoch=resj["acceptor_state"]["epoch"],
pg_version=resj["pg_info"]["pg_version"],
@@ -3049,6 +3075,7 @@ class SafekeeperHttpClient(requests.Session):
backup_lsn=Lsn(resj["backup_lsn"]),
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
walreceivers=walreceivers,
)
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):

Some files were not shown because too many files have changed in this diff Show More