Compare commits

..

1 Commits

Author SHA1 Message Date
Anna Khanova
136ed19387 Test 2024-03-27 13:42:33 +01:00
245 changed files with 5696 additions and 11835 deletions

View File

@@ -22,7 +22,6 @@
!s3_scrubber/ !s3_scrubber/
!safekeeper/ !safekeeper/
!storage_broker/ !storage_broker/
!storage_controller/
!trace/ !trace/
!vendor/postgres-*/ !vendor/postgres-*/
!workspace_hack/ !workspace_hack/

View File

@@ -150,7 +150,7 @@ runs:
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database # and to keep files on the host to upload them to the database
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
# Generate redirect # Generate redirect
cat <<EOF > ${WORKDIR}/index.html cat <<EOF > ${WORKDIR}/index.html

View File

@@ -10,7 +10,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
outputs: outputs:
dsn: dsn:
description: 'Created Branch DSN (for main database)' description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
runs: runs:
using: "composite" using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15 default: 15
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
provisioner: provisioner:
desctiption: 'k8s-pod or k8s-neonvm' desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod' default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true required: true
api_host: api_host:
desctiption: 'Neon API host' desctiption: 'Neon API host'
default: console-stage.neon.build default: console.stage.neon.tech
runs: runs:
using: "composite" using: "composite"

View File

@@ -18,7 +18,6 @@ on:
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }} group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: false
env: env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -147,16 +147,15 @@ jobs:
"neonvm-captest-new" "neonvm-captest-new"
], ],
"db_size": [ "10gb" ], "db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" }, { "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" }, { "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }, { "platform": "neonvm-captest-new", "db_size": "50gb" }]
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}' }'
if [ "$(date +%A)" = "Saturday" ]; then if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]') { "platform": "rds-aurora", "db_size": "50gb"}]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]') { "platform": "rds-aurora" }]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]') { "platform": "rds-aurora", "scale": "10" }]')
fi fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
neon-captest-reuse) neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;; ;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }} CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;; ;;
@@ -274,15 +270,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark init - name: Benchmark init
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: ClickBench benchmark - name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Run TPC-H benchmark - name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()") QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id") QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
QUERIES+=("SHOW neon.timeline_id")
fi fi
psql ${CONNSTR} -c "${QUERY}"
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Run user examples - name: Run user examples
uses: ./.github/actions/run-python-test-set uses: ./.github/actions/run-python-test-set

View File

@@ -21,7 +21,6 @@ defaults:
concurrency: concurrency:
group: build-build-tools-image-${{ inputs.image-tag }} group: build-build-tools-image-${{ inputs.image-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {} permissions: {}

View File

@@ -735,7 +735,7 @@ jobs:
run: | run: |
mkdir -p .docker-custom mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2 - uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3 - uses: docker/login-action@v3
with: with:
@@ -792,7 +792,7 @@ jobs:
run: | run: |
mkdir -p .docker-custom mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2 - uses: docker/setup-buildx-action@v3
with: with:
# Disable parallelism for docker buildkit. # Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -865,7 +865,7 @@ jobs:
run: run:
shell: sh -eu {0} shell: sh -eu {0}
env: env:
VM_BUILDER_VERSION: v0.28.1 VM_BUILDER_VERSION: v0.23.2
steps: steps:
- name: Checkout - name: Checkout
@@ -1127,15 +1127,15 @@ jobs:
-f deployProxy=false \ -f deployProxy=false \
-f deployStorage=true \ -f deployStorage=true \
-f deployStorageBroker=true \ -f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \ -f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true -f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \ -f deployStorage=true \
-f deployStorageBroker=true \ -f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
@@ -1144,7 +1144,6 @@ jobs:
-f deployProxy=true \ -f deployProxy=true \
-f deployStorage=false \ -f deployStorage=false \
-f deployStorageBroker=false \ -f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \ -f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \ -f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true -f deployPreprodRegion=true

View File

@@ -28,9 +28,7 @@ jobs:
- name: Get build-tools image tag for the current commit - name: Get build-tools image tag for the current commit
id: get-build-tools-tag id: get-build-tools-tag
env: env:
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs, COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
COMMIT_SHA: ${{ github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: | run: |
LAST_BUILD_TOOLS_SHA=$( LAST_BUILD_TOOLS_SHA=$(

View File

@@ -20,7 +20,6 @@ defaults:
concurrency: concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }} group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
permissions: {} permissions: {}

View File

@@ -62,14 +62,14 @@ jobs:
trigger-e2e-tests: trigger-e2e-tests:
needs: [ tag ] needs: [ tag ]
runs-on: ubuntu-latest runs-on: [ self-hosted, gen3, small ]
env: env:
TAG: ${{ needs.tag.outputs.build-tag }} TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps: steps:
- name: check if ecr image are present - name: check if ecr image are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: | run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
fi fi
done done
- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
- name: Set PR's status to pending and request a remote CI test - name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: | run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ REMOTE_REPO="${{ github.repository_owner }}/cloud"
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
gh workflow --repo ${REMOTE_REPO} \ curl -f -X POST \
run testing.yml \ https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
--ref "main" \ -H "Accept: application/vnd.github.v3+json" \
--raw-field "ci_job_name=neon-cloud-e2e" \ --user "${{ secrets.CI_ACCESS_TOKEN }}" \
--raw-field "commit_hash=$COMMIT_SHA" \ --data \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \ "{
--raw-field "storage_image_tag=${TAG}" \ \"state\": \"pending\",
--raw-field "compute_image_tag=${TAG}" \ \"context\": \"neon-cloud-e2e\",
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
--raw-field "e2e-platforms=${E2E_PLATFORMS}" }"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -1,5 +1,5 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute /compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage /control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage /libs/remote_storage/ @neondatabase/storage

543
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -3,7 +3,7 @@ resolver = "2"
members = [ members = [
"compute_tools", "compute_tools",
"control_plane", "control_plane",
"control_plane/storcon_cli", "control_plane/attachment_service",
"pageserver", "pageserver",
"pageserver/compaction", "pageserver/compaction",
"pageserver/ctl", "pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
"proxy", "proxy",
"safekeeper", "safekeeper",
"storage_broker", "storage_broker",
"storage_controller",
"s3_scrubber", "s3_scrubber",
"workspace_hack", "workspace_hack",
"trace", "trace",
@@ -44,7 +43,6 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] } anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6" arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18" azure_core = "0.18"
azure_identity = "0.18" azure_identity = "0.18"
azure_storage = "0.18" azure_storage = "0.18"
@@ -98,7 +96,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1" humantime = "2.1"
humantime-serde = "1.1.1" humantime-serde = "1.1.1"
hyper = "0.14" hyper = "0.14"
hyper-tungstenite = "0.13.0" hyper-tungstenite = "0.11"
inotify = "0.10.2" inotify = "0.10.2"
ipnet = "2.9.0" ipnet = "2.9.0"
itertools = "0.10" itertools = "0.10"
@@ -107,8 +105,7 @@ lasso = "0.7"
leaky-bucket = "1.0.1" leaky-bucket = "1.0.1"
libc = "0.2" libc = "0.2"
md5 = "0.7.0" md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] } measured = { version = "0.0.13", features=["default", "lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8" memoffset = "0.8"
native-tls = "0.2" native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -161,7 +158,7 @@ svg_fmt = "0.4.1"
sync_wrapper = "0.1.2" sync_wrapper = "0.1.2"
tar = "0.4" tar = "0.4"
task-local-extensions = "0.1.4" task-local-extensions = "0.1.4"
test-context = "0.3" test-context = "0.1"
thiserror = "1.0" thiserror = "1.0"
tikv-jemallocator = "0.5" tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5" tikv-jemalloc-ctl = "0.5"

View File

@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& mv protoc/include/google /usr/local/include/google \ && mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc && rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM # LLVM
ENV LLVM_VERSION=17 ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \

View File

@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install: # Install:
# libreadline8 for psql # libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check) # libicu67, locales for collations (including ICU and plpgsql_check)

View File

@@ -818,15 +818,9 @@ impl ComputeNode {
Client::connect(zenith_admin_connstr.as_str(), NoTls) Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role // Disable forwarding so that users don't get a cloud_admin role
client.simple_query("SET neon.forward_ddl = false")?;
let mut func = || { client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("SET neon.forward_ddl = false")?; client.simple_query("GRANT zenith_admin TO cloud_admin")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
Ok::<_, anyhow::Error>(())
};
func().context("apply_config setup cloud_admin")?;
drop(client); drop(client);
// reconnect with connstring with expected name // reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
}; };
// Disable DDL forwarding because control plane already knows about these roles/databases. // Disable DDL forwarding because control plane already knows about these roles/databases.
client client.simple_query("SET neon.forward_ddl = false")?;
.simple_query("SET neon.forward_ddl = false")
.context("apply_config SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important. // Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; create_neon_superuser(spec, &mut client)?;
cleanup_instance(&mut client).context("apply_config cleanup_instance")?; cleanup_instance(&mut client)?;
handle_roles(spec, &mut client).context("apply_config handle_roles")?; handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client).context("apply_config handle_databases")?; handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client) handle_role_deletions(spec, connstr.as_str(), &mut client)?;
.context("apply_config handle_role_deletions")?;
handle_grants( handle_grants(
spec, spec,
&mut client, &mut client,
connstr.as_str(), connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension), self.has_feature(ComputeFeature::AnonExtension),
) )?;
.context("apply_config handle_grants")?; handle_extensions(spec, &mut client)?;
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; handle_extension_neon(&mut client)?;
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; create_availability_check_data(&mut client)?;
create_availability_check_data(&mut client)
.context("apply_config create_availability_check_data")?;
// 'Close' connection // 'Close' connection
drop(client); drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts // Run migrations separately to not hold up cold starts
thread::spawn(move || { thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?; let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations") handle_migrations(&mut client)
}); });
Ok(()) Ok(())
} }
@@ -1273,12 +1262,10 @@ LIMIT 100",
.await .await
.map_err(DownloadError::Other); .map_err(DownloadError::Other);
if download_size.is_ok() { self.ext_download_progress
self.ext_download_progress .write()
.write() .expect("bad lock")
.expect("bad lock") .insert(ext_archive_name.to_string(), (download_start, true));
.insert(ext_archive_name.to_string(), (download_start, true));
}
download_size download_size
} }

View File

@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result; use anyhow::Result;
use crate::pg_helpers::escape_conf_value; use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; use compute_api::spec::{ComputeMode, ComputeSpec};
/// Check that `line` is inside a text file and put it there if it is not. /// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist. /// Create file if it doesn't exist.
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
} }
} }
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur under certain situations (e.g. when
// not running in Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those // If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() { if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "# Managed by compute_ctl: begin")?;

View File

@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res) format!("'{}'", res)
} }
pub trait GenericOptionExt { trait GenericOptionExt {
fn to_pg_option(&self) -> String; fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String; fn to_pg_setting(&self) -> String;
} }

View File

@@ -2,7 +2,7 @@ use std::fs::File;
use std::path::Path; use std::path::Path;
use std::str::FromStr; use std::str::FromStr;
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{anyhow, bail, Result};
use postgres::config::Config; use postgres::config::Config;
use postgres::{Client, NoTls}; use postgres::{Client, NoTls};
use reqwest::StatusCode; use reqwest::StatusCode;
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => { RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is // This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser. // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!( let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote() name.pg_quote()
); );
info!("running role create query: '{}'", &query); info!("running role create query: '{}'", &query);
@@ -698,8 +698,7 @@ pub fn handle_grants(
// it is important to run this after all grants // it is important to run this after all grants
if enable_anon_extension { if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false) handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
.context("handle_grants handle_extension_anon")?;
} }
} }
@@ -744,24 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// which may happen in two cases: // which may happen in two cases:
// - extension was just installed // - extension was just installed
// - extension was already installed and is up to date // - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE"; // DISABLED due to compute node unpinning epic
info!("update neon extension version with query: {}", query); // let query = "ALTER EXTENSION neon UPDATE";
if let Err(e) = client.simple_query(query) { // info!("update neon extension version with query: {}", query);
error!( // client.simple_query(query)?;
"failed to upgrade neon extension during `handle_extension_neon`: {}",
e
);
}
Ok(()) Ok(())
} }
#[instrument(skip_all)] #[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
info!("handle neon extension upgrade"); info!("handle neon extension upgrade (not really)");
let query = "ALTER EXTENSION neon UPDATE"; // DISABLED due to compute node unpinning epic
info!("update neon extension version with query: {}", query); // let query = "ALTER EXTENSION neon UPDATE";
client.simple_query(query)?; // info!("update neon extension version with query: {}", query);
// client.simple_query(query)?;
Ok(()) Ok(())
} }
@@ -810,40 +806,43 @@ $$;"#,
"", "",
"", "",
"", "",
"",
// Add new migrations below. // Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
]; ];
let mut func = || { let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; client.simple_query(query)?;
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?; client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?; client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?; client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?; client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
let query = "SELECT id FROM neon_migration.migration_id"; query = "SELECT id FROM neon_migration.migration_id";
let row = client let row = client.query_one(query, &[])?;
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize; let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration; let starting_migration_id = current_migration;
let query = "BEGIN"; query = "BEGIN";
client client.simple_query(query)?;
.simple_query(query)
.context("handle_migrations begin")?;
while current_migration < migrations.len() { while current_migration < migrations.len() {
let migration = &migrations[current_migration]; let migration = &migrations[current_migration];
@@ -851,9 +850,7 @@ $$;"#,
info!("Skip migration id={}", current_migration); info!("Skip migration id={}", current_migration);
} else { } else {
info!("Running migration:\n{}\n", migration); info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| { client.simple_query(migration)?;
format!("handle_migrations current_migration={}", current_migration)
})?;
} }
current_migration += 1; current_migration += 1;
} }
@@ -861,14 +858,10 @@ $$;"#,
"UPDATE neon_migration.migration_id SET id={}", "UPDATE neon_migration.migration_id SET id={}",
migrations.len() migrations.len()
); );
client client.simple_query(&setval)?;
.simple_query(&setval)
.context("handle_migrations update id")?;
let query = "COMMIT"; query = "COMMIT";
client client.simple_query(query)?;
.simple_query(query)
.context("handle_migrations commit")?;
info!( info!(
"Ran {} migrations", "Ran {} migrations",

View File

@@ -1,5 +1,5 @@
[package] [package]
name = "storage_controller" name = "attachment_service"
version = "0.1.0" version = "0.1.0"
edition.workspace = true edition.workspace = true
license.workspace = true license.workspace = true
@@ -25,7 +25,6 @@ git-version.workspace = true
hex.workspace = true hex.workspace = true
hyper.workspace = true hyper.workspace = true
humantime.workspace = true humantime.workspace = true
itertools.workspace = true
lasso.workspace = true lasso.workspace = true
once_cell.workspace = true once_cell.workspace = true
pageserver_api.workspace = true pageserver_api.workspace = true
@@ -45,8 +44,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" } diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" } r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" } utils = { path = "../../libs/utils/" }
metrics = { path = "../libs/metrics/" } metrics = { path = "../../libs/metrics/" }
control_plane = { path = "../control_plane" } control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../workspace_hack" } workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,4 +1,3 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration}; use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -15,32 +14,19 @@ use utils::{
use crate::service::Config; use crate::service::Config;
const BUSY_DELAY: Duration = Duration::from_secs(1);
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub(crate) const API_CONCURRENCY: usize = 32; pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
// Which node is this tenant attached to
node_id: NodeId,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
struct ShardedComputeHookTenant { struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize, stripe_size: ShardStripeSize,
shard_count: ShardCount, shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>, shards: Vec<(ShardNumber, NodeId)>,
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a chance since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
} }
enum ComputeHookTenant { enum ComputeHookTenant {
Unsharded(UnshardedComputeHookTenant), Unsharded(NodeId),
Sharded(ShardedComputeHookTenant), Sharded(ShardedComputeHookTenant),
} }
@@ -52,20 +38,9 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)], shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size, stripe_size,
shard_count: tenant_shard_id.shard_count, shard_count: tenant_shard_id.shard_count,
send_lock: Arc::default(),
}) })
} else { } else {
Self::Unsharded(UnshardedComputeHookTenant { Self::Unsharded(node_id)
node_id,
send_lock: Arc::default(),
})
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
} }
} }
@@ -78,8 +53,8 @@ impl ComputeHookTenant {
node_id: NodeId, node_id: NodeId,
) { ) {
match self { match self {
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
unsharded_tenant.node_id = node_id *existing_node_id = node_id
} }
Self::Sharded(sharded_tenant) Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size if sharded_tenant.stripe_size == stripe_size
@@ -106,14 +81,14 @@ impl ComputeHookTenant {
} }
} }
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] #[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard { struct ComputeHookNotifyRequestShard {
node_id: NodeId, node_id: NodeId,
shard_number: ShardNumber, shard_number: ShardNumber,
} }
/// Request body that we send to the control plane to notify it of where a tenant is attached /// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] #[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest { struct ComputeHookNotifyRequest {
tenant_id: TenantId, tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>, stripe_size: Option<ShardStripeSize>,
@@ -146,44 +121,14 @@ pub(crate) enum NotifyError {
Fatal(StatusCode), Fatal(StatusCode),
} }
enum MaybeSendResult {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
),
),
// Something requires sending, but you must wait for a current sender then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant { impl ComputeHookTenant {
fn maybe_send( fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
&self, match self {
tenant_id: TenantId, Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
) -> MaybeSendResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
tenant_id, tenant_id,
shards: vec![ComputeHookNotifyRequestShard { shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0), shard_number: ShardNumber(0),
node_id: unsharded_tenant.node_id, node_id: *node_id,
}], }],
stripe_size: None, stripe_size: None,
}), }),
@@ -207,25 +152,12 @@ impl ComputeHookTenant {
// Sharded tenant doesn't yet have information for all its shards // Sharded tenant doesn't yet have information for all its shards
tracing::info!( tracing::info!(
"ComputeHookTenant::maybe_send: not enough shards ({}/{})", "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(), sharded_tenant.shards.len(),
sharded_tenant.shard_count.count() sharded_tenant.shard_count.count()
); );
None None
} }
};
match request {
None => {
// Not yet ready to emit a notification
tracing::info!("Tenant isn't yet ready to emit a notification");
MaybeSendResult::Noop
}
Some(request) if Some(&request) == locked.as_ref() => {
// No change from the last value successfully sent
MaybeSendResult::Noop
}
Some(request) => MaybeSendResult::Transmit((request, locked)),
} }
} }
} }
@@ -235,19 +167,8 @@ impl ComputeHookTenant {
/// the compute connection string. /// the compute connection string.
pub(super) struct ComputeHook { pub(super) struct ComputeHook {
config: Config, config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>, state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>, authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing enviroments, to serialize calls into neon_lock
neon_local_lock: tokio::sync::Mutex<()>,
// We share a client across all notifications to enable connection re-use etc when
// sending large numbers of notifications
client: reqwest::Client,
} }
impl ComputeHook { impl ComputeHook {
@@ -257,30 +178,18 @@ impl ComputeHook {
.clone() .clone()
.map(|jwt| format!("Bearer {}", jwt)); .map(|jwt| format!("Bearer {}", jwt));
let client = reqwest::ClientBuilder::new()
.timeout(NOTIFY_REQUEST_TIMEOUT)
.build()
.expect("Failed to construct HTTP client");
Self { Self {
state: Default::default(), state: Default::default(),
config, config,
authorization_header, authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
client,
} }
} }
/// For test environments: use neon_local's LocalEnv to update compute /// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local( async fn do_notify_local(
&self, &self,
reconfigure_request: &ComputeHookNotifyRequest, reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() { let env = match LocalEnv::load_config() {
Ok(e) => e, Ok(e) => e,
Err(e) => { Err(e) => {
@@ -297,7 +206,7 @@ impl ComputeHook {
} = reconfigure_request; } = reconfigure_request;
let compute_pageservers = shards let compute_pageservers = shards
.iter() .into_iter()
.map(|shard| { .map(|shard| {
let ps_conf = env let ps_conf = env
.get_pageserver_conf(shard.node_id) .get_pageserver_conf(shard.node_id)
@@ -309,10 +218,10 @@ impl ComputeHook {
.collect::<Vec<_>>(); .collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints { for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,); tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size) .reconfigure(compute_pageservers.clone(), stripe_size)
.await?; .await?;
} }
} }
@@ -322,11 +231,12 @@ impl ComputeHook {
async fn do_notify_iteration( async fn do_notify_iteration(
&self, &self,
client: &reqwest::Client,
url: &String, url: &String,
reconfigure_request: &ComputeHookNotifyRequest, reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> Result<(), NotifyError> { ) -> Result<(), NotifyError> {
let req = self.client.request(Method::PUT, url); let req = client.request(Method::PUT, url);
let req = if let Some(value) = &self.authorization_header { let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value) req.header(reqwest::header::AUTHORIZATION, value)
} else { } else {
@@ -370,10 +280,11 @@ impl ComputeHook {
Err(NotifyError::SlowDown) Err(NotifyError::SlowDown)
} }
StatusCode::LOCKED => { StatusCode::LOCKED => {
// We consider this fatal, because it's possible that the operation blocking the control one is // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
// also the one that is waiting for this reconcile. We should let the reconciler calling // is not appropriate
// this hook fail, to give control plane a chance to un-lock. tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
tracing::info!("Control plane reports tenant is locked, dropping out of notify"); .await
.ok();
Err(NotifyError::Busy) Err(NotifyError::Busy)
} }
StatusCode::SERVICE_UNAVAILABLE StatusCode::SERVICE_UNAVAILABLE
@@ -389,27 +300,13 @@ impl ComputeHook {
async fn do_notify( async fn do_notify(
&self, &self,
url: &String, url: &String,
reconfigure_request: &ComputeHookNotifyRequest, reconfigure_request: ComputeHookNotifyRequest,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> Result<(), NotifyError> { ) -> Result<(), NotifyError> {
// We hold these semaphore units across all retries, rather than only across each let client = reqwest::Client::new();
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
let _units = self
.api_concurrency
.acquire()
.await
// Interpret closed semaphore as shutdown
.map_err(|_| NotifyError::ShuttingDown)?;
backoff::retry( backoff::retry(
|| self.do_notify_iteration(url, reconfigure_request, cancel), || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| { |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
matches!(
e,
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
)
},
3, 3,
10, 10,
"Send compute notification", "Send compute notification",
@@ -443,70 +340,42 @@ impl ComputeHook {
stripe_size: ShardStripeSize, stripe_size: ShardStripeSize,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> Result<(), NotifyError> { ) -> Result<(), NotifyError> {
let maybe_send_result = { let mut locked = self.state.lock().await;
let mut state_locked = self.state.lock().unwrap();
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new( Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id, tenant_shard_id,
stripe_size, stripe_size,
node_id, node_id,
)), )),
Entry::Occupied(e) => { Entry::Occupied(e) => {
let tenant = e.into_mut(); let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id); tenant.update(tenant_shard_id, stripe_size, node_id);
tenant tenant
} }
};
tenant.maybe_send(tenant_shard_id.tenant_id, None)
}; };
// Process result: we may get an update to send, or we may have to wait for a lock let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
// before trying again. let Some(reconfigure_request) = reconfigure_request else {
let (request, mut send_lock_guard) = match maybe_send_result { // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
MaybeSendResult::Noop => { // until it does.
return Ok(()); tracing::info!("Tenant isn't yet ready to emit a notification");
} return Ok(());
MaybeSendResult::AwaitLock(send_lock) => {
let send_locked = send_lock.lock_owned().await;
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
// try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}; };
let result = if let Some(notify_url) = &self.config.compute_hook_url { if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, &request, cancel).await self.do_notify(notify_url, reconfigure_request, cancel)
.await
} else { } else {
self.do_notify_local(&request).await.map_err(|e| { self.do_notify_local(reconfigure_request)
// This path is for testing only, so munge the error into our prod-style error type. .await
tracing::error!("Local notification hook failed: {e}"); .map_err(|e| {
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) // This path is for testing only, so munge the error into our prod-style error type.
}) tracing::error!("Local notification hook failed: {e}");
}; NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
if result.is_ok() {
// Before dropping the send lock, stash the request we just sent so that
// subsequent callers can avoid redundantly re-sending the same thing.
*send_lock_guard = Some(request);
} }
result
} }
} }
@@ -530,22 +399,21 @@ pub(crate) mod tests {
NodeId(1), NodeId(1),
); );
// An unsharded tenant is always ready to emit a notification, but won't // An unsharded tenant is always ready to emit a notification
// send the same one twice assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
let send_result = tenant_state.maybe_send(tenant_id, None); assert_eq!(
let MaybeSendResult::Transmit((request, mut guard)) = send_result else { tenant_state
anyhow::bail!("Wrong send result"); .maybe_reconfigure(tenant_id)
}; .unwrap()
assert_eq!(request.shards.len(), 1); .shards
assert!(request.stripe_size.is_none()); .len(),
1
// Simulate successful send );
*guard = Some(request); assert!(tenant_state
drop(guard); .maybe_reconfigure(tenant_id)
.unwrap()
// Try asking again: this should be a no-op .stripe_size
let send_result = tenant_state.maybe_send(tenant_id, None); .is_none());
assert!(matches!(send_result, MaybeSendResult::Noop));
// Writing the first shard of a multi-sharded situation (i.e. in a split) // Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in an non-notifying state (need to // resets the tenant state and puts it in an non-notifying state (need to
@@ -559,10 +427,7 @@ pub(crate) mod tests {
ShardStripeSize(32768), ShardStripeSize(32768),
NodeId(1), NodeId(1),
); );
assert!(matches!( assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
tenant_state.maybe_send(tenant_id, None),
MaybeSendResult::Noop
));
// Writing the second shard makes it ready to notify // Writing the second shard makes it ready to notify
tenant_state.update( tenant_state.update(
@@ -575,16 +440,22 @@ pub(crate) mod tests {
NodeId(1), NodeId(1),
); );
let send_result = tenant_state.maybe_send(tenant_id, None); assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
let MaybeSendResult::Transmit((request, mut guard)) = send_result else { assert_eq!(
anyhow::bail!("Wrong send result"); tenant_state
}; .maybe_reconfigure(tenant_id)
assert_eq!(request.shards.len(), 2); .unwrap()
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); .shards
.len(),
// Simulate successful send 2
*guard = Some(request); );
drop(guard); assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(()) Ok(())
} }

View File

@@ -8,7 +8,6 @@ use futures::Future;
use hyper::header::CONTENT_TYPE; use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response}; use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri}; use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{ use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest, TenantTimeTravelRequest, TimelineCreateRequest,
@@ -35,8 +34,7 @@ use utils::{
}; };
use pageserver_api::controller_api::{ use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
TenantShardMigrateRequest,
}; };
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
@@ -45,19 +43,15 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware; use routerify::Middleware;
/// State available to HTTP request handlers /// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState { pub struct HttpState {
service: Arc<crate::service::Service>, service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>, allowlist_routes: Vec<Uri>,
} }
impl HttpState { impl HttpState {
pub fn new( pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"] let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter() .iter()
.map(|v| v.parse().unwrap()) .map(|v| v.parse().unwrap())
@@ -65,7 +59,6 @@ impl HttpState {
Self { Self {
service, service,
auth, auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes, allowlist_routes,
} }
} }
@@ -405,15 +398,6 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
} }
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
json_response(StatusCode::OK, service.tenant_list())
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?; check_permissions(&req, Scope::Admin)?;
@@ -427,10 +411,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?; check_permissions(&req, Scope::Admin)?;
let state = get_state(&req); let state = get_state(&req);
let nodes = state.service.node_list().await?; json_response(StatusCode::OK, state.service.node_list().await?)
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
} }
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -497,22 +478,6 @@ async fn handle_tenant_shard_migrate(
) )
} }
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_update_policy(tenant_id, update_req)
.await?,
)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?; check_permissions(&req, Scope::PageServerApi)?;
@@ -544,14 +509,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.consistency_check().await?) json_response(StatusCode::OK, state.service.consistency_check().await?)
} }
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up /// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> { async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
@@ -608,17 +565,9 @@ where
.await .await
} }
/// Check if the required scope is held in the request's token, or if the request has
/// a token with 'admin' scope then always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> { fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| { check_permission_with(request, |claims| {
match crate::auth::check_permission(claims, required_scope) { crate::auth::check_permission(claims, required_scope)
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
}) })
} }
@@ -678,11 +627,10 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
}) })
} }
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> { pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let state = get_state(&req); let payload = crate::metrics::METRICS_REGISTRY.encode();
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let response = Response::builder() let response = Response::builder()
.status(200) .status(200)
.header(CONTENT_TYPE, TEXT_FORMAT) .header(CONTENT_TYPE, TEXT_FORMAT)
@@ -711,7 +659,6 @@ where
pub fn make_router( pub fn make_router(
service: Arc<Service>, service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>, auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> { ) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router() let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware()) .middleware(prologue_metrics_middleware())
@@ -728,7 +675,7 @@ pub fn make_router(
} }
router router
.data(Arc::new(HttpState::new(service, auth, build_info))) .data(Arc::new(HttpState::new(service, auth)))
.get("/metrics", |r| { .get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics")) named_request_span(r, measured_metrics_handler, RequestName("metrics"))
}) })
@@ -779,9 +726,6 @@ pub fn make_router(
RequestName("debug_v1_consistency_check"), RequestName("debug_v1_consistency_check"),
) )
}) })
.post("/debug/v1/reconcile_all", |r| {
request_span(r, handle_reconcile_all)
})
.put("/debug/v1/failpoints", |r| { .put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new())) request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
}) })
@@ -821,16 +765,6 @@ pub fn make_router(
RequestName("control_v1_tenant_describe"), RequestName("control_v1_tenant_describe"),
) )
}) })
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,
handle_tenant_update_policy,
RequestName("control_v1_tenant_policy"),
)
})
// Tenant operations // Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.

View File

@@ -14,7 +14,7 @@ mod reconciler;
mod scheduler; mod scheduler;
mod schema; mod schema;
pub mod service; pub mod service;
mod tenant_shard; mod tenant_state;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64); struct Sequence(u64);

View File

@@ -1,20 +1,18 @@
use anyhow::{anyhow, Context}; use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use camino::Utf8PathBuf; use camino::Utf8PathBuf;
use clap::Parser; use clap::Parser;
use diesel::Connection; use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp; use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc; use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use tokio::signal::unix::SignalKind; use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat}; use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener}; use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
@@ -52,7 +50,7 @@ struct Cli {
#[arg(short, long)] #[arg(short, long)]
path: Option<Utf8PathBuf>, path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)] #[arg(long)]
database_url: Option<String>, database_url: Option<String>,
@@ -160,8 +158,6 @@ fn main() -> anyhow::Result<()> {
std::process::exit(1); std::process::exit(1);
})); }));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread() tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately // We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections. // as many blocking threads as we will open database connections.
@@ -193,11 +189,6 @@ async fn async_main() -> anyhow::Result<()> {
args.listen args.listen
); );
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev { let strict_mode = if args.dev {
StrictMode::Dev StrictMode::Dev
} else { } else {
@@ -259,7 +250,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets let auth = secrets
.public_key .public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth, build_info) let router = make_router(service.clone(), auth)
.build() .build()
.map_err(|err| anyhow!(err))?; .map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap(); let router_service = utils::http::RouterService::new(router).unwrap();

View File

@@ -8,8 +8,10 @@
//! The rest of the code defines label group types and deals with converting outer types to labels. //! The rest of the code defines label group types and deals with converting outer types to labels.
//! //!
use bytes::Bytes; use bytes::Bytes;
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; use measured::{
use metrics::NeonMetrics; label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use std::sync::Mutex; use std::sync::Mutex;
@@ -24,28 +26,21 @@ pub fn preinitialize_metrics() {
pub(crate) struct StorageControllerMetrics { pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup, pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::BufferedTextEncoder>, encoder: Mutex<measured::text::TextEncoder>,
} }
#[derive(measured::MetricGroup)] #[derive(measured::MetricGroup)]
#[metric(new())]
pub(crate) struct StorageControllerMetricGroup { pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task /// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter, pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled /// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete: pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>, measured::CounterVec<ReconcileCompleteLabelGroupSet>,
/// Count of how many times we make an optimization change to a tenant's scheduling
pub(crate) storage_controller_schedule_optimization: measured::Counter,
/// HTTP request status counters for handled requests /// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status: pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>, measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes /// HTTP request handler latency across all status codes
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_http_request_latency: pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>, measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
@@ -57,7 +52,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of HTTP requests to the pageserver, broken down by pageserver /// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful /// node id, request name and method. This include both successful and unsuccessful
/// requests. /// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_pageserver_request_latency: pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>, measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -69,7 +63,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful /// node id, request name and method. This include both successful and unsuccessful
/// requests. /// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_passthrough_request_latency: pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>, measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -78,34 +71,75 @@ pub(crate) struct StorageControllerMetricGroup {
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>, measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation. /// Latency of database queries, broken down by operation.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency: pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>, measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
} }
impl StorageControllerMetrics { impl StorageControllerMetrics {
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { pub(crate) fn encode(&self) -> Bytes {
let mut encoder = self.encoder.lock().unwrap(); let mut encoder = self.encoder.lock().unwrap();
neon_metrics self.metrics_group.collect_into(&mut *encoder);
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
encoder.finish() encoder.finish()
} }
} }
impl Default for StorageControllerMetrics { impl Default for StorageControllerMetrics {
fn default() -> Self { fn default() -> Self {
let mut metrics_group = StorageControllerMetricGroup::new();
metrics_group
.storage_controller_reconcile_complete
.init_all_dense();
Self { Self {
metrics_group, metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
} }
} }
} }
@@ -119,7 +153,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
#[derive(measured::LabelGroup)] #[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)] #[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> { pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str, pub(crate) path: &'a str,
pub(crate) method: Method, pub(crate) method: Method,
pub(crate) status: StatusCode, pub(crate) status: StatusCode,
@@ -128,21 +162,40 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[derive(measured::LabelGroup)] #[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)] #[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> { pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str, pub(crate) path: &'a str,
pub(crate) method: Method, pub(crate) method: Method,
} }
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)] #[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)] #[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> { pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) pageserver_id: &'a str, pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo, default)] #[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str, pub(crate) path: &'a str,
pub(crate) method: Method, pub(crate) method: Method,
} }
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)] #[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)] #[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup { pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -156,7 +209,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation, pub(crate) operation: DatabaseOperation,
} }
#[derive(FixedCardinalityLabel, Clone, Copy)] #[derive(FixedCardinalityLabel)]
pub(crate) enum ReconcileOutcome { pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")] #[label(rename = "ok")]
Success, Success,
@@ -164,7 +217,7 @@ pub(crate) enum ReconcileOutcome {
Cancel, Cancel,
} }
#[derive(FixedCardinalityLabel, Copy, Clone)] #[derive(FixedCardinalityLabel, Clone)]
pub(crate) enum Method { pub(crate) enum Method {
Get, Get,
Put, Put,
@@ -189,12 +242,11 @@ impl From<hyper::Method> for Method {
} }
} }
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode { impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output { fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as i64) v.write_int(self.0.as_u16() as u64)
} }
} }
@@ -212,7 +264,7 @@ impl FixedCardinalityLabel for StatusCode {
} }
} }
#[derive(FixedCardinalityLabel, Clone, Copy)] #[derive(FixedCardinalityLabel)]
pub(crate) enum DatabaseErrorLabel { pub(crate) enum DatabaseErrorLabel {
Query, Query,
Connection, Connection,

View File

@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
use hyper::StatusCode; use hyper::StatusCode;
use pageserver_api::{ use pageserver_api::{
controller_api::{ controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
TenantLocateResponseShard,
}, },
shard::TenantShardId, shard::TenantShardId,
}; };
@@ -257,19 +256,6 @@ impl Node {
) )
.await .await
} }
/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
} }
impl std::fmt::Display for Node { impl std::fmt::Display for Node {

View File

@@ -9,7 +9,6 @@ use camino::Utf8PathBuf;
use diesel::pg::PgConnection; use diesel::pg::PgConnection;
use diesel::prelude::*; use diesel::prelude::*;
use diesel::Connection; use diesel::Connection;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig; use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError; use pageserver_api::shard::ShardConfigError;
@@ -79,7 +78,7 @@ pub(crate) enum DatabaseError {
Logical(String), Logical(String),
} }
#[derive(measured::FixedCardinalityLabel, Copy, Clone)] #[derive(measured::FixedCardinalityLabel, Clone)]
pub(crate) enum DatabaseOperation { pub(crate) enum DatabaseOperation {
InsertNode, InsertNode,
UpdateNode, UpdateNode,
@@ -108,12 +107,6 @@ pub(crate) enum AbortShardSplitStatus {
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>; pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
/// Some methods can operate on either a whole tenant or a single shard
pub(crate) enum TenantFilter {
Tenant(TenantId),
Shard(TenantShardId),
}
impl Persistence { impl Persistence {
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
@@ -147,13 +140,15 @@ impl Persistence {
/// Wraps `with_conn` in order to collect latency and error metrics /// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R> async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static, F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static, R: Send + 'static,
{ {
let latency = &METRICS_REGISTRY let latency = &METRICS_REGISTRY
.metrics_group .metrics_group
.storage_controller_database_query_latency; .storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let res = self.with_conn(func).await; let res = self.with_conn(func).await;
@@ -173,7 +168,7 @@ impl Persistence {
/// Call the provided function in a tokio blocking thread, with a Diesel database connection. /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R> async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static, F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static, R: Send + 'static,
{ {
let mut conn = self.connection_pool.get()?; let mut conn = self.connection_pool.get()?;
@@ -280,11 +275,6 @@ impl Persistence {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string(); shard.placement_policy = "{\"Attached\":0}".to_string();
} }
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
}
} }
let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect(); let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -475,45 +465,59 @@ impl Persistence {
/// that we only do the first time a tenant is set to an attached policy via /location_config. /// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard( pub(crate) async fn update_tenant_shard(
&self, &self,
tenant: TenantFilter, tenant_shard_id: TenantShardId,
input_placement_policy: Option<PlacementPolicy>, input_placement_policy: PlacementPolicy,
input_config: Option<TenantConfig>, input_config: TenantConfig,
input_generation: Option<Generation>, input_generation: Option<Generation>,
input_scheduling_policy: Option<ShardSchedulingPolicy>,
) -> DatabaseResult<()> { ) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*; use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
let query = match tenant { let query = diesel::update(tenant_shards)
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.into_boxed(),
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.into_boxed(),
};
#[derive(AsChangeset)] if let Some(input_generation) = input_generation {
#[diesel(table_name = crate::schema::tenant_shards)] // Update includes generation column
struct ShardUpdate { query
generation: Option<i32>, .set((
placement_policy: Option<String>, generation.eq(Some(input_generation.into().unwrap() as i32)),
config: Option<String>, placement_policy
scheduling_policy: Option<String>, .eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} }
let update = ShardUpdate { Ok(())
generation: input_generation.map(|g| g.into().unwrap() as i32), })
placement_policy: input_placement_policy .await?;
.map(|p| serde_json::to_string(&p).unwrap()),
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
};
query.set(update).execute(conn)?; Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(()) Ok(())
}) })
@@ -694,7 +698,7 @@ impl Persistence {
} }
} }
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)] #[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence { pub(crate) struct TenantShardPersistence {
@@ -724,8 +728,6 @@ pub(crate) struct TenantShardPersistence {
pub(crate) splitting: SplitState, pub(crate) splitting: SplitState,
#[serde(default)] #[serde(default)]
pub(crate) config: String, pub(crate) config: String,
#[serde(default)]
pub(crate) scheduling_policy: String,
} }
impl TenantShardPersistence { impl TenantShardPersistence {

View File

@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError}; use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node; use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s"; const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created /// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states. /// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler { pub(super) struct Reconciler {
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task. /// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId, pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity, pub(crate) shard: ShardIdentity,
@@ -48,11 +48,11 @@ pub(super) struct Reconciler {
/// To avoid stalling if the cloud control plane is unavailable, we may proceed /// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool, pub(crate) compute_notify_failure: bool,
/// A means to abort background reconciliation: it is essential to /// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantShard that /// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for /// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for /// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed. /// the tenant is changed.
@@ -66,7 +66,7 @@ pub(super) struct Reconciler {
pub(crate) persistence: Arc<Persistence>, pub(crate) persistence: Arc<Persistence>,
} }
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with, /// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run. /// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)] #[derive(Debug)]
@@ -487,7 +487,6 @@ impl Reconciler {
while let Err(e) = self.compute_notify().await { while let Err(e) = self.compute_notify().await {
match e { match e {
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
_ => { _ => {
tracing::warn!( tracing::warn!(
"Live migration blocked by compute notification error, retrying: {e}" "Live migration blocked by compute notification error, retrying: {e}"

View File

@@ -1,4 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard}; use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore; use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize; use serde::Serialize;
use std::collections::HashMap; use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {
#[derive(Serialize)] #[derive(Serialize)]
struct SchedulerNode { struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize, shard_count: usize,
/// Whether this node is currently elegible to have new shards scheduled (this is derived /// Whether this node is currently elegible to have new shards scheduled (this is derived
@@ -58,86 +58,6 @@ pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>, nodes: HashMap<NodeId, SchedulerNode>,
} }
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
///
/// For example, we may set an affinity score based on the number of shards from the same
/// tenant already on a node, to implicitly prefer to balance out shards.
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct AffinityScore(pub(crate) usize);
impl AffinityScore {
/// If we have no anti-affinity at all toward a node, this is its score. It means
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
/// based on other information such as total utilization.
pub(crate) const FREE: Self = Self(0);
pub(crate) fn inc(&mut self) {
self.0 += 1;
}
}
impl std::ops::Add for AffinityScore {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self(self.0 + rhs.0)
}
}
/// Hint for whether this is a sincere attempt to schedule, or a speculative
/// check for where we _would_ schedule (done during optimization)
#[derive(Debug)]
pub(crate) enum ScheduleMode {
Normal,
Speculative,
}
impl Default for ScheduleMode {
fn default() -> Self {
Self::Normal
}
}
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default)]
pub(crate) struct ScheduleContext {
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
pub(crate) mode: ScheduleMode,
}
impl ScheduleContext {
/// Input is a list of nodes we would like to avoid using again within this context. The more
/// times a node is passed into this call, the less inclined we are to use it.
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
for node_id in nodes {
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
entry.inc()
}
}
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
let entry = self.attached_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
.copied()
.unwrap_or(AffinityScore::FREE)
}
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
}
}
impl Scheduler { impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self { pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new(); let mut scheduler_nodes = HashMap::new();
@@ -163,7 +83,7 @@ impl Scheduler {
pub(crate) fn consistency_check<'a>( pub(crate) fn consistency_check<'a>(
&self, &self,
nodes: impl Iterator<Item = &'a Node>, nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantShard>, shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new(); let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes { for node in nodes {
@@ -304,87 +224,53 @@ impl Scheduler {
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
} }
/// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
/// are already in use by this shard -- we use this to avoid picking the same node
/// as both attached and secondary location. This is a hard constraint: if we cannot
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
///
/// context: we prefer to avoid using nodes identified in the context, according
/// to their anti-affinity score. We use this to prefeer to avoid placing shards in
/// the same tenant on the same node. This is a soft constraint: the context will never
/// cause us to fail to schedule a shard.
pub(crate) fn schedule_shard(
&self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() { if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers); return Err(ScheduleError::NoPageservers);
} }
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes .nodes
.iter() .iter()
.filter_map(|(k, v)| { .filter_map(|(k, v)| {
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
None None
} else { } else {
Some(( Some((*k, v.shard_count))
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
v.shard_count,
))
} }
}) })
.collect(); .collect();
// Sort by, in order of precedence: // Sort by tenant count. Nodes with the same tenant count are sorted by ID.
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available tenant_counts.sort_by_key(|i| (i.1, i.0));
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.0));
if scores.is_empty() { if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. // After applying constraints, no pageservers were left. We log some detail about
if !matches!(context.mode, ScheduleMode::Speculative) { // the state of nodes to help understand why this happened. This is not logged as an error because
// If this was not a speculative attempt, log details to understand why we couldn't // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
// schedule: this may help an engineer understand if some nodes are marked offline tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
// in a way that's preventing progress. for (node_id, node) in &self.nodes {
tracing::info!( tracing::info!(
"Scheduling failure, while excluding {hard_exclude:?}, node states:" "Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
); );
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
);
}
} }
return Err(ScheduleError::ImpossibleConstraint); return Err(ScheduleError::ImpossibleConstraint);
} }
// Lowest score wins let node_id = tenant_counts.first().unwrap().0;
let node_id = scores.first().unwrap().0; tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
if !matches!(context.mode, ScheduleMode::Speculative) { tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
); );
}
// Note that we do not update shard count here to reflect the scheduling: that // Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used. // is IntentState's job when the scheduled location is used.
Ok(node_id) Ok(node_id)
} }
/// Unit test access to internal state
#[cfg(test)]
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().shard_count
}
} }
#[cfg(test)] #[cfg(test)]
@@ -421,7 +307,7 @@ pub(crate) mod test_utils {
mod tests { mod tests {
use super::*; use super::*;
use crate::tenant_shard::IntentState; use crate::tenant_state::IntentState;
#[test] #[test]
fn scheduler_basic() -> anyhow::Result<()> { fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2); let nodes = test_utils::make_test_nodes(2);
@@ -430,17 +316,15 @@ mod tests {
let mut t1_intent = IntentState::new(); let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new(); let mut t2_intent = IntentState::new();
let context = ScheduleContext::default(); let scheduled = scheduler.schedule_shard(&[])?;
let scheduled = scheduler.schedule_shard(&[], &context)?;
t1_intent.set_attached(&mut scheduler, Some(scheduled)); t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[], &context)?; let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled)); t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled); t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);

View File

@@ -22,7 +22,6 @@ diesel::table! {
placement_policy -> Varchar, placement_policy -> Varchar,
splitting -> Int2, splitting -> Int2,
config -> Text, config -> Text,
scheduling_policy -> Varchar,
} }
} }

View File

@@ -8,10 +8,7 @@ use std::{
}; };
use crate::{ use crate::{
id_lock_map::IdLockMap, id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::ReconcileError,
scheduler::{ScheduleContext, ScheduleMode},
}; };
use anyhow::Context; use anyhow::Context;
use control_plane::storage_controller::{ use control_plane::storage_controller::{
@@ -20,14 +17,12 @@ use control_plane::storage_controller::{
use diesel::result::DatabaseErrorKind; use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt}; use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode; use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::{ use pageserver_api::{
controller_api::{ controller_api::{
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, TenantShardMigrateResponse, UtilizationScore,
UtilizationScore,
}, },
models::{SecondaryProgress, TenantConfigRequest}, models::{SecondaryProgress, TenantConfigRequest},
}; };
@@ -56,6 +51,7 @@ use utils::{
generation::Generation, generation::Generation,
http::error::ApiError, http::error::ApiError,
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
seqwait::SeqWait,
sync::gate::Gate, sync::gate::Gate,
}; };
@@ -66,10 +62,11 @@ use crate::{
persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence},
reconciler::attached_location_conf, reconciler::attached_location_conf,
scheduler::Scheduler, scheduler::Scheduler,
tenant_shard::{ tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
ReconcilerWaiter, TenantShard, ReconcilerWaiter, TenantState,
}, },
Sequence,
}; };
// For operations that should be quick, like attaching a new tenant // For operations that should be quick, like attaching a new tenant
@@ -92,7 +89,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
// Top level state available to all HTTP handlers // Top level state available to all HTTP handlers
struct ServiceState { struct ServiceState {
tenants: BTreeMap<TenantShardId, TenantShard>, tenants: BTreeMap<TenantShardId, TenantState>,
nodes: Arc<HashMap<NodeId, Node>>, nodes: Arc<HashMap<NodeId, Node>>,
@@ -102,7 +99,7 @@ struct ServiceState {
impl ServiceState { impl ServiceState {
fn new( fn new(
nodes: HashMap<NodeId, Node>, nodes: HashMap<NodeId, Node>,
tenants: BTreeMap<TenantShardId, TenantShard>, tenants: BTreeMap<TenantShardId, TenantState>,
scheduler: Scheduler, scheduler: Scheduler,
) -> Self { ) -> Self {
Self { Self {
@@ -116,7 +113,7 @@ impl ServiceState {
&mut self, &mut self,
) -> ( ) -> (
&mut Arc<HashMap<NodeId, Node>>, &mut Arc<HashMap<NodeId, Node>>,
&mut BTreeMap<TenantShardId, TenantShard>, &mut BTreeMap<TenantShardId, TenantState>,
&mut Scheduler, &mut Scheduler,
) { ) {
(&mut self.nodes, &mut self.tenants, &mut self.scheduler) (&mut self.nodes, &mut self.tenants, &mut self.scheduler)
@@ -335,11 +332,11 @@ impl Service {
for (tenant_shard_id, shard_observations) in observed { for (tenant_shard_id, shard_observations) in observed {
for (node_id, observed_loc) in shard_observations { for (node_id, observed_loc) in shard_observations {
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id)); cleanup.push((tenant_shard_id, node_id));
continue; continue;
}; };
tenant_shard tenant_state
.observed .observed
.locations .locations
.insert(node_id, ObservedStateLocation { conf: observed_loc }); .insert(node_id, ObservedStateLocation { conf: observed_loc });
@@ -347,15 +344,9 @@ impl Service {
} }
// Populate each tenant's intent state // Populate each tenant's intent state
let mut schedule_context = ScheduleContext::default(); for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
for (tenant_shard_id, tenant_shard) in tenants.iter_mut() { tenant_state.intent_from_observed(scheduler);
if tenant_shard_id.shard_number == ShardNumber(0) { if let Err(e) = tenant_state.schedule(scheduler) {
// Reset scheduling context each time we advance to the next Tenant
schedule_context = ScheduleContext::default();
}
tenant_shard.intent_from_observed(scheduler);
if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
// Non-fatal error: we are unable to properly schedule the tenant, perhaps because // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
// not enough pageservers are available. The tenant may well still be available // not enough pageservers are available. The tenant may well still be available
// to clients. // to clients.
@@ -364,11 +355,11 @@ impl Service {
// If we're both intending and observed to be attached at a particular node, we will // If we're both intending and observed to be attached at a particular node, we will
// emit a compute notification for this. In the case where our observed state does not // emit a compute notification for this. In the case where our observed state does not
// yet match our intent, we will eventually reconcile, and that will emit a compute notification. // yet match our intent, we will eventually reconcile, and that will emit a compute notification.
if let Some(attached_at) = tenant_shard.stably_attached() { if let Some(attached_at) = tenant_state.stably_attached() {
compute_notifications.push(( compute_notifications.push((
*tenant_shard_id, *tenant_shard_id,
attached_at, attached_at,
tenant_shard.shard.stripe_size, tenant_state.shard.stripe_size,
)); ));
} }
} }
@@ -679,13 +670,7 @@ impl Service {
let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
while !self.cancel.is_cancelled() { while !self.cancel.is_cancelled() {
tokio::select! { tokio::select! {
_ = interval.tick() => { _ = interval.tick() => { self.reconcile_all(); }
let reconciles_spawned = self.reconcile_all();
if reconciles_spawned == 0 {
// Run optimizer only when we didn't find any other work to do
self.optimize_all();
}
}
_ = self.cancel.cancelled() => return _ = self.cancel.cancelled() => return
} }
} }
@@ -743,7 +728,7 @@ impl Service {
/// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation
/// was successful, this will update the observed state of the tenant such that subsequent /// was successful, this will update the observed state of the tenant such that subsequent
/// calls to [`TenantShard::maybe_reconcile`] will do nothing. /// calls to [`TenantState::maybe_reconcile`] will do nothing.
#[instrument(skip_all, fields( #[instrument(skip_all, fields(
tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
sequence=%result.sequence sequence=%result.sequence
@@ -761,10 +746,10 @@ impl Service {
tenant.generation = std::cmp::max(tenant.generation, result.generation); tenant.generation = std::cmp::max(tenant.generation, result.generation);
// If the reconciler signals that it failed to notify compute, set this state on // If the reconciler signals that it failed to notify compute, set this state on
// the shard so that a future [`TenantShard::maybe_reconcile`] will try again. // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification; tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantShard know it is idle. // Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence); tenant.reconcile_complete(result.sequence);
match result.result { match result.result {
@@ -972,14 +957,30 @@ impl Service {
} }
for tsp in tenant_shard_persistence { for tsp in tenant_shard_persistence {
let tenant_shard_id = tsp.get_tenant_shard_id()?; let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued. // it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new(); let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver { if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
} }
let new_tenant = TenantShard::from_persistent(tsp, intent)?;
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
};
tenants.insert(tenant_shard_id, new_tenant); tenants.insert(tenant_shard_id, new_tenant);
} }
@@ -1103,8 +1104,6 @@ impl Service {
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(), config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(), splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
}; };
match self.persistence.insert_tenant_shards(vec![tsp]).await { match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -1126,7 +1125,7 @@ impl Service {
let mut locked = self.inner.write().unwrap(); let mut locked = self.inner.write().unwrap();
locked.tenants.insert( locked.tenants.insert(
attach_req.tenant_shard_id, attach_req.tenant_shard_id,
TenantShard::new( TenantState::new(
attach_req.tenant_shard_id, attach_req.tenant_shard_id,
ShardIdentity::unsharded(), ShardIdentity::unsharded(),
PlacementPolicy::Attached(0), PlacementPolicy::Attached(0),
@@ -1157,10 +1156,9 @@ impl Service {
// when we reattaching a detached tenant. // when we reattaching a detached tenant.
self.persistence self.persistence
.update_tenant_shard( .update_tenant_shard(
TenantFilter::Shard(attach_req.tenant_shard_id), attach_req.tenant_shard_id,
Some(PlacementPolicy::Attached(0)), PlacementPolicy::Attached(0),
Some(conf), conf,
None,
None, None,
) )
.await?; .await?;
@@ -1178,32 +1176,32 @@ impl Service {
let mut locked = self.inner.write().unwrap(); let mut locked = self.inner.write().unwrap();
let (_nodes, tenants, scheduler) = locked.parts_mut(); let (_nodes, tenants, scheduler) = locked.parts_mut();
let tenant_shard = tenants let tenant_state = tenants
.get_mut(&attach_req.tenant_shard_id) .get_mut(&attach_req.tenant_shard_id)
.expect("Checked for existence above"); .expect("Checked for existence above");
if let Some(new_generation) = new_generation { if let Some(new_generation) = new_generation {
tenant_shard.generation = Some(new_generation); tenant_state.generation = Some(new_generation);
tenant_shard.policy = PlacementPolicy::Attached(0); tenant_state.policy = PlacementPolicy::Attached(0);
} else { } else {
// This is a detach notification. We must update placement policy to avoid re-attaching // This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during storage controller restart. // during background scheduling/reconciliation, or during storage controller restart.
assert!(attach_req.node_id.is_none()); assert!(attach_req.node_id.is_none());
tenant_shard.policy = PlacementPolicy::Detached; tenant_state.policy = PlacementPolicy::Detached;
} }
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tracing::info!( tracing::info!(
tenant_id = %attach_req.tenant_shard_id, tenant_id = %attach_req.tenant_shard_id,
ps_id = %attaching_pageserver, ps_id = %attaching_pageserver,
generation = ?tenant_shard.generation, generation = ?tenant_state.generation,
"issuing", "issuing",
); );
} else if let Some(ps_id) = tenant_shard.intent.get_attached() { } else if let Some(ps_id) = tenant_state.intent.get_attached() {
tracing::info!( tracing::info!(
tenant_id = %attach_req.tenant_shard_id, tenant_id = %attach_req.tenant_shard_id,
%ps_id, %ps_id,
generation = ?tenant_shard.generation, generation = ?tenant_state.generation,
"dropping", "dropping",
); );
} else { } else {
@@ -1211,14 +1209,14 @@ impl Service {
tenant_id = %attach_req.tenant_shard_id, tenant_id = %attach_req.tenant_shard_id,
"no-op: tenant already has no pageserver"); "no-op: tenant already has no pageserver");
} }
tenant_shard tenant_state
.intent .intent
.set_attached(scheduler, attach_req.node_id); .set_attached(scheduler, attach_req.node_id);
tracing::info!( tracing::info!(
"attach_hook: tenant {} set generation {:?}, pageserver {}", "attach_hook: tenant {} set generation {:?}, pageserver {}",
attach_req.tenant_shard_id, attach_req.tenant_shard_id,
tenant_shard.generation, tenant_state.generation,
// TODO: this is an odd number of 0xf's // TODO: this is an odd number of 0xf's
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
); );
@@ -1230,36 +1228,36 @@ impl Service {
#[cfg(feature = "testing")] #[cfg(feature = "testing")]
{ {
if let Some(node_id) = attach_req.node_id { if let Some(node_id) = attach_req.node_id {
tenant_shard.observed.locations = HashMap::from([( tenant_state.observed.locations = HashMap::from([(
node_id, node_id,
ObservedStateLocation { ObservedStateLocation {
conf: Some(attached_location_conf( conf: Some(attached_location_conf(
tenant_shard.generation.unwrap(), tenant_state.generation.unwrap(),
&tenant_shard.shard, &tenant_state.shard,
&tenant_shard.config, &tenant_state.config,
false, false,
)), )),
}, },
)]); )]);
} else { } else {
tenant_shard.observed.locations.clear(); tenant_state.observed.locations.clear();
} }
} }
Ok(AttachHookResponse { Ok(AttachHookResponse {
gen: attach_req gen: attach_req
.node_id .node_id
.map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
}) })
} }
pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse {
let locked = self.inner.read().unwrap(); let locked = self.inner.read().unwrap();
let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id); let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
InspectResponse { InspectResponse {
attachment: tenant_shard.and_then(|s| { attachment: tenant_state.and_then(|s| {
s.intent s.intent
.get_attached() .get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
@@ -1321,11 +1319,11 @@ impl Service {
let mut locked = self.inner.write().unwrap(); let mut locked = self.inner.write().unwrap();
for (tenant_shard_id, observed_loc) in configs.tenant_shards { for (tenant_shard_id, observed_loc) in configs.tenant_shards {
let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
cleanup.push(tenant_shard_id); cleanup.push(tenant_shard_id);
continue; continue;
}; };
tenant_shard tenant_state
.observed .observed
.locations .locations
.insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
@@ -1496,13 +1494,13 @@ impl Service {
}; };
for req_tenant in validate_req.tenants { for req_tenant in validate_req.tenants {
if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
tracing::info!( tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})", "handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id, req_tenant.id,
req_tenant.gen, req_tenant.gen,
tenant_shard.generation tenant_state.generation
); );
response.tenants.push(ValidateResponseTenant { response.tenants.push(ValidateResponseTenant {
id: req_tenant.id, id: req_tenant.id,
@@ -1617,8 +1615,6 @@ impl Service {
placement_policy: serde_json::to_string(&placement_policy).unwrap(), placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(), config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(), splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
}) })
.collect(); .collect();
@@ -1641,8 +1637,6 @@ impl Service {
Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))),
}; };
let mut schedule_context = ScheduleContext::default();
let (waiters, response_shards) = { let (waiters, response_shards) = {
let mut locked = self.inner.write().unwrap(); let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut(); let (nodes, tenants, scheduler) = locked.parts_mut();
@@ -1664,14 +1658,11 @@ impl Service {
// attached and secondary locations (independently) away frorm those // attached and secondary locations (independently) away frorm those
// pageservers also holding a shard for this tenant. // pageservers also holding a shard for this tenant.
entry entry.get_mut().schedule(scheduler).map_err(|e| {
.get_mut() ApiError::Conflict(format!(
.schedule(scheduler, &mut schedule_context) "Failed to schedule shard {tenant_shard_id}: {e}"
.map_err(|e| { ))
ApiError::Conflict(format!( })?;
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?;
if let Some(node_id) = entry.get().intent.get_attached() { if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry let generation = entry
@@ -1688,7 +1679,7 @@ impl Service {
continue; continue;
} }
Entry::Vacant(entry) => { Entry::Vacant(entry) => {
let state = entry.insert(TenantShard::new( let state = entry.insert(TenantState::new(
tenant_shard_id, tenant_shard_id,
ShardIdentity::from_params( ShardIdentity::from_params(
tenant_shard_id.shard_number, tenant_shard_id.shard_number,
@@ -1699,7 +1690,7 @@ impl Service {
state.generation = initial_generation; state.generation = initial_generation;
state.config = create_req.config.clone(); state.config = create_req.config.clone();
if let Err(e) = state.schedule(scheduler, &mut schedule_context) { if let Err(e) = state.schedule(scheduler) {
schcedule_error = Some(e); schcedule_error = Some(e);
} }
@@ -1763,9 +1754,6 @@ impl Service {
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation of a series of shard updates. /// and transform it into either a tenant creation of a series of shard updates.
///
/// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will
/// still be returned.
fn tenant_location_config_prepare( fn tenant_location_config_prepare(
&self, &self,
tenant_id: TenantId, tenant_id: TenantId,
@@ -1813,12 +1801,17 @@ impl Service {
_ => None, _ => None,
}; };
updates.push(ShardUpdate { if shard.policy != placement_policy
tenant_shard_id: *shard_id, || shard.config != req.config.tenant_conf
placement_policy: placement_policy.clone(), || set_generation.is_some()
tenant_config: req.config.tenant_conf.clone(), {
generation: set_generation, updates.push(ShardUpdate {
}); tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
} }
if create { if create {
@@ -1847,7 +1840,6 @@ impl Service {
}, },
) )
} else { } else {
assert!(!updates.is_empty());
TenantCreateOrUpdate::Update(updates) TenantCreateOrUpdate::Update(updates)
} }
} }
@@ -1906,7 +1898,6 @@ impl Service {
// Persist updates // Persist updates
// Ordering: write to the database before applying changes in-memory, so that // Ordering: write to the database before applying changes in-memory, so that
// we will not appear time-travel backwards on a restart. // we will not appear time-travel backwards on a restart.
let mut schedule_context = ScheduleContext::default();
for ShardUpdate { for ShardUpdate {
tenant_shard_id, tenant_shard_id,
placement_policy, placement_policy,
@@ -1916,11 +1907,10 @@ impl Service {
{ {
self.persistence self.persistence
.update_tenant_shard( .update_tenant_shard(
TenantFilter::Shard(*tenant_shard_id), *tenant_shard_id,
Some(placement_policy.clone()), placement_policy.clone(),
Some(tenant_config.clone()), tenant_config.clone(),
*generation, *generation,
None,
) )
.await?; .await?;
} }
@@ -1954,7 +1944,7 @@ impl Service {
shard.generation = Some(generation); shard.generation = Some(generation);
} }
shard.schedule(scheduler, &mut schedule_context)?; shard.schedule(scheduler)?;
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
if let Some(waiter) = maybe_waiter { if let Some(waiter) = maybe_waiter {
@@ -1998,13 +1988,7 @@ impl Service {
let config = req.config; let config = req.config;
self.persistence self.persistence
.update_tenant_shard( .update_tenant_config(req.tenant_id, config.clone())
TenantFilter::Tenant(req.tenant_id),
None,
Some(config.clone()),
None,
None,
)
.await?; .await?;
let waiters = { let waiters = {
@@ -2114,7 +2098,7 @@ impl Service {
let scheduler = &locked.scheduler; let scheduler = &locked.scheduler;
// Right now we only perform the operation on a single node without parallelization // Right now we only perform the operation on a single node without parallelization
// TODO fan out the operation to multiple nodes for better performance // TODO fan out the operation to multiple nodes for better performance
let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; let node_id = scheduler.schedule_shard(&[])?;
let node = locked let node = locked
.nodes .nodes
.get(&node_id) .get(&node_id)
@@ -2357,58 +2341,6 @@ impl Service {
Ok(StatusCode::NOT_FOUND) Ok(StatusCode::NOT_FOUND)
} }
/// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
/// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies
/// the tenant's policies (configuration) within the storage controller
pub(crate) async fn tenant_update_policy(
&self,
tenant_id: TenantId,
req: TenantPolicyRequest,
) -> Result<(), ApiError> {
// We require an exclusive lock, because we are updating persistent and in-memory state
let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
let TenantPolicyRequest {
placement,
scheduling,
} = req;
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(tenant_id),
placement.clone(),
None,
None,
scheduling,
)
.await?;
let mut schedule_context = ScheduleContext::default();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
if let Some(placement) = &placement {
shard.policy = placement.clone();
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
"Updated placement policy to {placement:?}");
}
if let Some(scheduling) = &scheduling {
shard.set_scheduling_policy(*scheduling);
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
"Updated scheduling policy to {scheduling:?}");
}
// In case scheduling is being switched back on, try it now.
shard.schedule(scheduler, &mut schedule_context).ok();
self.maybe_reconcile_shard(shard, nodes);
}
Ok(())
}
pub(crate) async fn tenant_timeline_create( pub(crate) async fn tenant_timeline_create(
&self, &self,
tenant_id: TenantId, tenant_id: TenantId,
@@ -2735,71 +2667,45 @@ impl Service {
}) })
} }
/// Returns None if the input iterator of shards does not include a shard with number=0
fn tenant_describe_impl<'a>(
&self,
shards: impl Iterator<Item = &'a TenantShard>,
) -> Option<TenantDescribeResponse> {
let mut shard_zero = None;
let mut describe_shards = Vec::new();
for shard in shards {
if shard.tenant_shard_id.is_shard_zero() {
shard_zero = Some(shard);
}
describe_shards.push(TenantDescribeResponseShard {
tenant_shard_id: shard.tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
scheduling_policy: *shard.get_scheduling_policy(),
})
}
let shard_zero = shard_zero?;
Some(TenantDescribeResponse {
tenant_id: shard_zero.tenant_shard_id.tenant_id,
shards: describe_shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
}
pub(crate) fn tenant_describe( pub(crate) fn tenant_describe(
&self, &self,
tenant_id: TenantId, tenant_id: TenantId,
) -> Result<TenantDescribeResponse, ApiError> { ) -> Result<TenantDescribeResponse, ApiError> {
let locked = self.inner.read().unwrap(); let locked = self.inner.read().unwrap();
self.tenant_describe_impl( let mut shard_zero = None;
locked let mut shards = Vec::new();
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.map(|(_k, v)| v),
)
.ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
}
pub(crate) fn tenant_list(&self) -> Vec<TenantDescribeResponse> { for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id))
let locked = self.inner.read().unwrap();
let mut result = Vec::new();
for (_tenant_id, tenant_shards) in
&locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id)
{ {
result.push( if tenant_shard_id.is_zero() {
self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) shard_zero = Some(shard);
.expect("Groups are always non-empty"), }
);
let response_shard = TenantDescribeResponseShard {
tenant_shard_id: *tenant_shard_id,
node_attached: *shard.intent.get_attached(),
node_secondary: shard.intent.get_secondary().to_vec(),
last_error: shard.last_error.lock().unwrap().clone(),
is_reconciling: shard.reconciler.is_some(),
is_pending_compute_notification: shard.pending_compute_notification,
is_splitting: matches!(shard.splitting, SplitState::Splitting),
};
shards.push(response_shard);
} }
result let Some(shard_zero) = shard_zero else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
Ok(TenantDescribeResponse {
shards,
stripe_size: shard_zero.shard.stripe_size,
policy: shard_zero.policy.clone(),
config: shard_zero.config.clone(),
})
} }
#[instrument(skip_all, fields(tenant_id=%op.tenant_id))] #[instrument(skip_all, fields(tenant_id=%op.tenant_id))]
@@ -2892,7 +2798,7 @@ impl Service {
tracing::info!("Restoring parent shard {tenant_shard_id}"); tracing::info!("Restoring parent shard {tenant_shard_id}");
shard.splitting = SplitState::Idle; shard.splitting = SplitState::Idle;
if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { if let Err(e) = shard.schedule(scheduler) {
// If this shard can't be scheduled now (perhaps due to offline nodes or // If this shard can't be scheduled now (perhaps due to offline nodes or
// capacity issues), that must not prevent us rolling back a split. In this // capacity issues), that must not prevent us rolling back a split. In this
// case it should be eventually scheduled in the background. // case it should be eventually scheduled in the background.
@@ -3016,7 +2922,6 @@ impl Service {
) )
}; };
let mut schedule_context = ScheduleContext::default();
for child in child_ids { for child in child_ids {
let mut child_shard = parent_ident; let mut child_shard = parent_ident;
child_shard.number = child.shard_number; child_shard.number = child.shard_number;
@@ -3038,7 +2943,7 @@ impl Service {
}, },
); );
let mut child_state = TenantShard::new(child, child_shard, policy.clone()); let mut child_state = TenantState::new(child, child_shard, policy.clone());
child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.intent = IntentState::single(scheduler, Some(pageserver));
child_state.observed = ObservedState { child_state.observed = ObservedState {
locations: child_observed, locations: child_observed,
@@ -3046,13 +2951,13 @@ impl Service {
child_state.generation = Some(generation); child_state.generation = Some(generation);
child_state.config = config.clone(); child_state.config = config.clone();
// The child's TenantShard::splitting is intentionally left at the default value of Idle, // The child's TenantState::splitting is intentionally left at the default value of Idle,
// as at this point in the split process we have succeeded and this part is infallible: // as at this point in the split process we have succeeded and this part is infallible:
// we will never need to do any special recovery from this state. // we will never need to do any special recovery from this state.
child_locations.push((child, pageserver, child_shard.stripe_size)); child_locations.push((child, pageserver, child_shard.stripe_size));
if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) { if let Err(e) = child_state.schedule(scheduler) {
// This is not fatal, because we've implicitly already got an attached // This is not fatal, because we've implicitly already got an attached
// location for the child shard. Failure here just means we couldn't // location for the child shard. Failure here just means we couldn't
// find a secondary (e.g. because cluster is overloaded). // find a secondary (e.g. because cluster is overloaded).
@@ -3345,10 +3250,6 @@ impl Service {
placement_policy: serde_json::to_string(&policy).unwrap(), placement_policy: serde_json::to_string(&policy).unwrap(),
config: serde_json::to_string(&config).unwrap(), config: serde_json::to_string(&config).unwrap(),
splitting: SplitState::Splitting, splitting: SplitState::Splitting,
// Scheduling policies do not carry through to children
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
}); });
} }
@@ -3595,8 +3496,8 @@ impl Service {
Ok(()) Ok(())
} }
/// For debug/support: a full JSON dump of TenantShards. Returns a response so that /// For debug/support: a full JSON dump of TenantStates. Returns a response so that
/// we don't have to make TenantShard clonable in the return path. /// we don't have to make TenantState clonable in the return path.
pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> { pub(crate) fn tenants_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
let serialized = { let serialized = {
let locked = self.inner.read().unwrap(); let locked = self.inner.read().unwrap();
@@ -3700,7 +3601,7 @@ impl Service {
} }
/// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
/// we don't have to make TenantShard clonable in the return path. /// we don't have to make TenantState clonable in the return path.
pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> { pub(crate) fn scheduler_dump(&self) -> Result<hyper::Response<hyper::Body>, ApiError> {
let serialized = { let serialized = {
let locked = self.inner.read().unwrap(); let locked = self.inner.read().unwrap();
@@ -3916,9 +3817,8 @@ impl Service {
AvailabilityTransition::ToOffline => { AvailabilityTransition::ToOffline => {
tracing::info!("Node {} transition to offline", node_id); tracing::info!("Node {} transition to offline", node_id);
let mut tenants_affected: usize = 0; let mut tenants_affected: usize = 0;
for (tenant_shard_id, tenant_state) in tenants {
for (tenant_shard_id, tenant_shard) in tenants { if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
// When a node goes offline, we set its observed configuration to None, indicating unknown: we will // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
// not assume our knowledge of the node's configuration is accurate until it comes back online // not assume our knowledge of the node's configuration is accurate until it comes back online
observed_loc.conf = None; observed_loc.conf = None;
@@ -3931,24 +3831,18 @@ impl Service {
continue; continue;
} }
if tenant_shard.intent.demote_attached(node_id) { if tenant_state.intent.demote_attached(node_id) {
tenant_shard.sequence = tenant_shard.sequence.next(); tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
// TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
// for tenants without secondary locations: if they have a secondary location, then this
// schedule() call is just promoting an existing secondary)
let mut schedule_context = ScheduleContext::default();
match tenant_shard.schedule(scheduler, &mut schedule_context) {
Err(e) => { Err(e) => {
// It is possible that some tenants will become unschedulable when too many pageservers // It is possible that some tenants will become unschedulable when too many pageservers
// go offline: in this case there isn't much we can do other than make the issue observable. // go offline: in this case there isn't much we can do other than make the issue observable.
// TODO: give TenantShard a scheduling error attribute to be queried later. // TODO: give TenantState a scheduling error attribute to be queried later.
tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
} }
Ok(()) => { Ok(()) => {
if self if self
.maybe_reconcile_shard(tenant_shard, &new_nodes) .maybe_reconcile_shard(tenant_state, &new_nodes)
.is_some() .is_some()
{ {
tenants_affected += 1; tenants_affected += 1;
@@ -3967,10 +3861,10 @@ impl Service {
tracing::info!("Node {} transition to active", node_id); tracing::info!("Node {} transition to active", node_id);
// When a node comes back online, we must reconcile any tenant that has a None observed // When a node comes back online, we must reconcile any tenant that has a None observed
// location on the node. // location on the node.
for tenant_shard in locked.tenants.values_mut() { for tenant_state in locked.tenants.values_mut() {
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) {
if observed_loc.conf.is_none() { if observed_loc.conf.is_none() {
self.maybe_reconcile_shard(tenant_shard, &new_nodes); self.maybe_reconcile_shard(tenant_state, &new_nodes);
} }
} }
} }
@@ -3990,6 +3884,9 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for /// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere. /// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule( fn ensure_attached_schedule(
&self, &self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3998,27 +3895,10 @@ impl Service {
let mut waiters = Vec::new(); let mut waiters = Vec::new();
let (nodes, tenants, scheduler) = locked.parts_mut(); let (nodes, tenants, scheduler) = locked.parts_mut();
let mut schedule_context = ScheduleContext::default(); for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { shard.schedule(scheduler)?;
shard.schedule(scheduler, &mut schedule_context)?;
// The shard's policies may not result in an attached location being scheduled: this
// is an error because our caller needs it attached somewhere.
if shard.intent.get_attached().is_none() {
return Err(anyhow::anyhow!(
"Tenant {tenant_id} not scheduled to be attached"
));
};
if shard.stably_attached().is_some() {
// We do not require the shard to be totally up to date on reconciliation: we just require
// that it has been attached on the intended node. Other dirty state such as unattached secondary
// locations, or compute hook notifications can be ignored.
continue;
}
if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
waiters.push(waiter); waiters.push(waiter);
} }
} }
@@ -4053,11 +3933,11 @@ impl Service {
Ok(()) Ok(())
} }
/// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides
/// all the references to parts of Self that are needed /// all the references to parts of Self that are needed
fn maybe_reconcile_shard( fn maybe_reconcile_shard(
&self, &self,
shard: &mut TenantShard, shard: &mut TenantState,
nodes: &Arc<HashMap<NodeId, Node>>, nodes: &Arc<HashMap<NodeId, Node>>,
) -> Option<ReconcilerWaiter> { ) -> Option<ReconcilerWaiter> {
shard.maybe_reconcile( shard.maybe_reconcile(
@@ -4080,145 +3960,8 @@ impl Service {
let (nodes, tenants, _scheduler) = locked.parts_mut(); let (nodes, tenants, _scheduler) = locked.parts_mut();
let pageservers = nodes.clone(); let pageservers = nodes.clone();
let mut schedule_context = ScheduleContext::default();
let mut reconciles_spawned = 0; let mut reconciles_spawned = 0;
for (tenant_shard_id, shard) in tenants.iter_mut() { for (_tenant_shard_id, shard) in tenants.iter_mut() {
if tenant_shard_id.is_shard_zero() {
schedule_context = ScheduleContext::default();
}
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another one
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1;
}
schedule_context.avoid(&shard.intent.all_pageservers());
}
reconciles_spawned
}
/// `optimize` in this context means identifying shards which have valid scheduled locations, but
/// could be scheduled somewhere better:
/// - Cutting over to a secondary if the node with the secondary is more lightly loaded
/// * e.g. after a node fails then recovers, to move some work back to it
/// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant
/// * e.g. after a shard split, the initial attached locations will all be on the node where
/// we did the split, but are probably better placed elsewhere.
/// - Creating new secondary locations if it improves the spreading of a sharded tenant
/// * e.g. after a shard split, some locations will be on the same node (where the split
/// happened), and will probably be better placed elsewhere.
///
/// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at
/// the time of scheduling, this function looks for cases where a better-scoring location is available
/// according to those same soft constraints.
fn optimize_all(&self) -> usize {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let pageservers = nodes.clone();
let mut schedule_context = ScheduleContext::default();
let mut reconciles_spawned = 0;
let mut tenant_shards: Vec<&TenantShard> = Vec::new();
// Limit on how many shards' optimizations each call to this function will execute. Combined
// with the frequency of background calls, this acts as an implicit rate limit that runs a small
// trickle of optimizations in the background, rather than executing a large number in parallel
// when a change occurs.
const MAX_OPTIMIZATIONS_PER_PASS: usize = 2;
let mut work = Vec::new();
for (tenant_shard_id, shard) in tenants.iter() {
if tenant_shard_id.is_shard_zero() {
// Reset accumulators on the first shard in a tenant
schedule_context = ScheduleContext::default();
schedule_context.mode = ScheduleMode::Speculative;
tenant_shards.clear();
}
if work.len() >= MAX_OPTIMIZATIONS_PER_PASS {
break;
}
match shard.get_scheduling_policy() {
ShardSchedulingPolicy::Active => {
// Ok to do optimization
}
ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause
| ShardSchedulingPolicy::Stop => {
// Policy prevents optimizing this shard.
continue;
}
}
// Accumulate the schedule context for all the shards in a tenant: we must have
// the total view of all shards before we can try to optimize any of them.
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
tenant_shards.push(shard);
// Once we have seen the last shard in the tenant, proceed to search across all shards
// in the tenant for optimizations
if shard.shard.number.0 == shard.shard.count.count() - 1 {
if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
// Do not start any optimizations while another change to the tenant is ongoing: this
// is not necessary for correctness, but simplifies operations and implicitly throttles
// optimization changes to happen in a "trickle" over time.
continue;
}
if tenant_shards.iter().any(|s| {
!matches!(s.splitting, SplitState::Idle)
|| matches!(s.policy, PlacementPolicy::Detached)
}) {
// Never attempt to optimize a tenant that is currently being split, or
// a tenant that is meant to be detached
continue;
}
// TODO: optimization calculations are relatively expensive: create some fast-path for
// the common idle case (avoiding the search on tenants that we have recently checked)
for shard in &tenant_shards {
if let Some(optimization) =
// If idle, maybe optimize attachments: if a shard has a secondary location that is preferable to
// its primary location based on soft constraints, cut it over.
shard.optimize_attachment(nodes, &schedule_context)
{
work.push((shard.tenant_shard_id, optimization));
break;
} else if let Some(optimization) =
// If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
// better placed on another node, based on ScheduleContext, then adjust it. This
// covers cases like after a shard split, where we might have too many shards
// in the same tenant with secondary locations on the node where they originally split.
shard.optimize_secondary(scheduler, &schedule_context)
{
work.push((shard.tenant_shard_id, optimization));
break;
}
// TODO: extend this mechanism to prefer attaching on nodes with fewer attached
// tenants (i.e. extend schedule state to distinguish attached from secondary counts),
// for the total number of attachments on a node (not just within a tenant.)
}
}
}
for (tenant_shard_id, optimization) in work {
let shard = tenants
.get_mut(&tenant_shard_id)
.expect("We held lock from place we got this ID");
shard.apply_optimization(scheduler, optimization);
if self.maybe_reconcile_shard(shard, &pageservers).is_some() { if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1; reconciles_spawned += 1;
} }
@@ -4227,35 +3970,9 @@ impl Service {
reconciles_spawned reconciles_spawned
} }
/// Test helper: perform the work of one background [`Self::reconcile_all`] pass and then
/// block until every reconciler that is currently in flight has finished.
///
/// If the reconcile pass spawned nothing, an optimization pass ([`Self::optimize_all`]) is
/// run as well, so optimizations are only attempted when the system is otherwise idle.
///
/// Returns the number of reconcile waiters that were awaited. Calling this repeatedly until
/// it returns zero should leave the system quiescent, i.e. later background reconciliations
/// will have nothing to do.
///
/// # Errors
///
/// Propagates the error from `await_waiters` if waiting on the reconcilers fails.
pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
    // Optimizations are only considered when the reconcile pass found nothing to do.
    if self.reconcile_all() == 0 {
        self.optimize_all();
    }

    // Gather waiters inside a nested scope so the read guard is released before we await.
    let pending: Vec<_> = {
        let state = self.inner.read().unwrap();
        state
            .tenants
            .values()
            .filter_map(|shard| shard.get_waiter())
            .collect()
    };

    let pending_count = pending.len();
    self.await_waiters(pending, RECONCILE_TIMEOUT).await?;
    Ok(pending_count)
}
pub async fn shutdown(&self) { pub async fn shutdown(&self) {
// Note that this already stops processing any results from reconciles: so // Note that this already stops processing any results from reconciles: so
// we do not expect that our [`TenantShard`] objects will reach a neat // we do not expect that our [`TenantState`] objects will reach a neat
// final state. // final state.
self.cancel.cancel(); self.cancel.cancel();

View File

@@ -7,9 +7,8 @@ use std::{
use crate::{ use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence, persistence::TenantShardPersistence,
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
}; };
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::{ use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig}, models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId}, shard::{ShardIdentity, TenantShardId},
@@ -50,7 +49,7 @@ where
/// This struct implement Serialize for debugging purposes, but is _not_ persisted /// This struct implement Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)] #[derive(Serialize)]
pub(crate) struct TenantShard { pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId, pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity, pub(crate) shard: ShardIdentity,
@@ -117,10 +116,6 @@ pub(crate) struct TenantShard {
/// sending it. This is the mechanism by which compute notifications are included in the scope /// sending it. This is the mechanism by which compute notifications are included in the scope
/// of state that we publish externally in an eventually consistent way. /// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool, pub(crate) pending_compute_notification: bool,
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
// be set to a non-active state to avoid making changes while the issue is fixed.
scheduling_policy: ShardSchedulingPolicy,
} }
#[derive(Default, Clone, Debug, Serialize)] #[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +246,8 @@ impl IntentState {
impl Drop for IntentState { impl Drop for IntentState {
fn drop(&mut self) { fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler. // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
// We do not check this while panicking, to avoid polluting unit test failures or debug_assert!(self.attached.is_none() && self.secondary.is_empty());
// other assertions with this assertion's output. It's still wrong to leak these,
// but if we already have a panic then we don't need to independently flag this case.
if !(std::thread::panicking()) {
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
} }
} }
@@ -302,26 +292,6 @@ pub enum ReconcileWaitError {
Failed(TenantShardId, String), Failed(TenantShardId, String),
} }
/// Payload of `ScheduleOptimization::ReplaceSecondary`: swap one secondary location
/// of a shard for a secondary location on a different node.
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct ReplaceSecondary {
/// Node whose secondary location is removed when the optimization is applied.
old_node_id: NodeId,
/// Node that receives the new secondary location when the optimization is applied.
new_node_id: NodeId,
}
/// Payload of `ScheduleOptimization::MigrateAttachment`: move a shard's attachment
/// from its current node to a different node.
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct MigrateAttachment {
/// Node that is demoted from attached when the optimization is applied.
old_attached_node_id: NodeId,
/// Node that is promoted to attached when the optimization is applied.
new_attached_node_id: NodeId,
}
/// A single proposed change to a shard's intended locations, produced by
/// `optimize_attachment` / `optimize_secondary` and carried out by `apply_optimization`.
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum ScheduleOptimization {
/// Replace one of our secondary locations with a different node
ReplaceSecondary(ReplaceSecondary),
/// Migrate attachment to an existing secondary location
MigrateAttachment(MigrateAttachment),
}
impl ReconcilerWaiter { impl ReconcilerWaiter {
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
tokio::select! { tokio::select! {
@@ -354,7 +324,7 @@ pub(crate) struct ReconcilerHandle {
} }
/// When a reconcile task completes, it sends this result object /// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantShard. /// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult { pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence, pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incompleted description /// On errors, `observed` should be treated as an incompleted description
@@ -367,7 +337,7 @@ pub(crate) struct ReconcileResult {
pub(crate) generation: Option<Generation>, pub(crate) generation: Option<Generation>,
pub(crate) observed: ObservedState, pub(crate) observed: ObservedState,
/// Set [`TenantShard::pending_compute_notification`] from this flag /// Set [`TenantState::pending_compute_notification`] from this flag
pub(crate) pending_compute_notification: bool, pub(crate) pending_compute_notification: bool,
} }
@@ -379,7 +349,7 @@ impl ObservedState {
} }
} }
impl TenantShard { impl TenantState {
pub(crate) fn new( pub(crate) fn new(
tenant_shard_id: TenantShardId, tenant_shard_id: TenantShardId,
shard: ShardIdentity, shard: ShardIdentity,
@@ -400,7 +370,6 @@ impl TenantShard {
error_waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(), last_error: Arc::default(),
pending_compute_notification: false, pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
} }
} }
@@ -456,7 +425,6 @@ impl TenantShard {
fn schedule_attached( fn schedule_attached(
&mut self, &mut self,
scheduler: &mut Scheduler, scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(bool, NodeId), ScheduleError> { ) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant // No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached { if let Some(node_id) = self.intent.attached {
@@ -470,33 +438,14 @@ impl TenantShard {
Ok((true, promote_secondary)) Ok((true, promote_secondary))
} else { } else {
// Pick a fresh node: either we had no secondaries or none were schedulable // Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?; let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id); tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id)); self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id)) Ok((true, node_id))
} }
} }
pub(crate) fn schedule( pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
&mut self,
scheduler: &mut Scheduler,
context: &mut ScheduleContext,
) -> Result<(), ScheduleError> {
let r = self.do_schedule(scheduler, context);
context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
r
}
pub(crate) fn do_schedule(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in // TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other // self.intent refers to pageservers that are offline, and pick other
// pageservers if so. // pageservers if so.
@@ -504,16 +453,6 @@ impl TenantShard {
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
// change their attach location. // change their attach location.
match self.scheduling_policy {
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
return Ok(());
}
}
// Build the set of pageservers already in use by this tenant, to avoid scheduling // Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using. // more work on the same pageservers we're already using.
let mut modified = false; let mut modified = false;
@@ -540,13 +479,12 @@ impl TenantShard {
} }
// Should have exactly one attached, and N secondaries // Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
self.schedule_attached(scheduler, context)?;
modified |= modified_attached; modified |= modified_attached;
let mut used_pageservers = vec![attached_node_id]; let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count { while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers, context)?; let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id); self.intent.push_secondary(scheduler, node_id);
used_pageservers.push(node_id); used_pageservers.push(node_id);
modified = true; modified = true;
@@ -559,7 +497,7 @@ impl TenantShard {
modified = true; modified = true;
} else if self.intent.secondary.is_empty() { } else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node // Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[], context)?; let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id); self.intent.push_secondary(scheduler, node_id);
modified = true; modified = true;
} }
@@ -586,167 +524,6 @@ impl TenantShard {
Ok(()) Ok(())
} }
/// Look for an attachment optimization: if one of this shard's existing locations scores
/// better under the soft constraints in `schedule_context` than the node it is currently
/// attached to, propose migrating the attachment there.
///
/// This never proposes a brand new location; it only considers the nodes already present in
/// the shard's intent (attached plus secondaries), skipping nodes that may not be scheduled
/// on or that are missing from `nodes`.
///
/// Returns `None` when the shard has no attached location, has no secondaries, has no
/// schedulable candidate, is already on the preferred node, or the improvement is too small
/// to be worth a migration.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_attachment(
    &self,
    nodes: &HashMap<NodeId, Node>,
    schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
    let attached = (*self.intent.get_attached())?;

    // Without a secondary there is nothing to cut over to.
    if self.intent.secondary.is_empty() {
        return None;
    }

    let current_affinity_score = schedule_context.get_node_affinity(attached);
    let current_attachment_count = schedule_context.get_node_attachments(attached);

    // Score every schedulable location and keep the best one. Ordering of preference:
    //   1. lowest affinity score
    //   2. lowest number of attachments in this context
    //   3. lowest node ID, so that the outcome is deterministic in tests
    let candidates = self.intent.all_pageservers();
    let best = candidates
        .iter()
        .filter_map(|node_id| {
            let may_schedule = nodes
                .get(node_id)
                .map(|n| n.may_schedule())
                .unwrap_or(MaySchedule::No);
            if matches!(may_schedule, MaySchedule::No) {
                return None;
            }
            Some((
                *node_id,
                schedule_context.get_node_affinity(*node_id),
                schedule_context.get_node_attachments(*node_id),
            ))
        })
        .min_by_key(|&(node_id, affinity, attachments)| (affinity, attachments, node_id));

    let (preferred_node, preferred_affinity_score, preferred_attachment_count) = best?;

    if attached == preferred_node {
        tracing::debug!(
            "Node {} is already preferred (score {:?})",
            preferred_node,
            preferred_affinity_score
        );
        return None;
    }

    // Require the alternative to be better by more than 1. Otherwise the next call could
    // find the mirror-image situation and migrate straight back (e.g. moving from a score of
    // 1 to a score of 0 just swaps which node holds the 1).
    let worthwhile = current_affinity_score > preferred_affinity_score + AffinityScore(1)
        || current_attachment_count > preferred_attachment_count + 1;
    if !worthwhile {
        return None;
    }

    tracing::info!(
        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
        self.intent.get_secondary()
    );
    Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
        old_attached_node_id: attached,
        new_attached_node_id: preferred_node,
    }))
}
/// Look for a secondary-location optimization: if one of this shard's secondaries sits on a
/// node that carries an affinity score in `schedule_context`, and the scheduler can offer a
/// node with a sufficiently lower score, propose replacing that secondary.
///
/// Secondaries are examined in order and the first worthwhile replacement is returned.
/// Returns `None` if the shard has no secondaries or no replacement is worthwhile.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_secondary(
    &self,
    scheduler: &Scheduler,
    schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
    // Nothing to replace if there are no secondary locations at all.
    if self.intent.secondary.is_empty() {
        return None;
    }

    self.intent.get_secondary().iter().find_map(|secondary| {
        // A secondary on a node with no entry in the context is not subject to any
        // affinity constraint, so it is left where it is.
        let affinity_score = schedule_context.nodes.get(secondary)?;

        // Ask the scheduler where it would place us if we were scheduling afresh, passing
        // it every node this shard already uses. A scheduling error means there is no
        // possible replacement for this secondary.
        let candidate_node = scheduler
            .schedule_shard(&self.intent.all_pageservers(), schedule_context)
            .ok()?;

        // A candidate with no entry in the context is treated as having the FREE score.
        let candidate_affinity_score = schedule_context
            .nodes
            .get(&candidate_node)
            .copied()
            .unwrap_or(AffinityScore::FREE);

        // Require the candidate to be better by more than 1, otherwise a later call could
        // flap the secondary straight back.
        if candidate_affinity_score + AffinityScore(1) < *affinity_score {
            tracing::info!(
                "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
                self.intent.get_secondary()
            );
            Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
                old_node_id: *secondary,
                new_node_id: candidate_node,
            }))
        } else {
            None
        }
    })
}
/// Apply a previously identified [`ScheduleOptimization`] to this shard's intent state.
///
/// The schedule-optimization counter is incremented once per call, before the intent
/// is modified.
pub(crate) fn apply_optimization(
    &mut self,
    scheduler: &mut Scheduler,
    optimization: ScheduleOptimization,
) {
    metrics::METRICS_REGISTRY
        .metrics_group
        .storage_controller_schedule_optimization
        .inc();

    match optimization {
        // Move the attachment: demote the old attached node, then promote the new one.
        ScheduleOptimization::MigrateAttachment(migration) => {
            self.intent.demote_attached(migration.old_attached_node_id);
            self.intent
                .promote_attached(scheduler, migration.new_attached_node_id);
        }
        // Swap one secondary location for another.
        ScheduleOptimization::ReplaceSecondary(replacement) => {
            self.intent
                .remove_secondary(scheduler, replacement.old_node_id);
            self.intent
                .push_secondary(scheduler, replacement.new_node_id);
        }
    }
}
/// Query whether the tenant's observed state for attached node matches its intent state, and if so, /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -891,19 +668,6 @@ impl TenantShard {
} }
} }
// Pre-checks done: finally check whether we may actually do the work
match self.scheduling_policy {
ShardSchedulingPolicy::Active
| ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause => {}
ShardSchedulingPolicy::Stop => {
// We only reach this point if there is work to do and we're going to skip
// doing it: warn, so that it is obvious why this tenant isn't doing what it ought to.
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
return None;
}
}
// Build list of nodes from which the reconciler should detach // Build list of nodes from which the reconciler should detach
let mut detach = Vec::new(); let mut detach = Vec::new();
for node_id in self.observed.locations.keys() { for node_id in self.observed.locations.keys() {
@@ -1040,22 +804,6 @@ impl TenantShard {
}) })
} }
/// Return a waiter for the reconciliation currently in flight, without starting one.
///
/// Yields `None` when no reconciler is running for this shard.
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
    if self.reconciler.is_none() {
        return None;
    }

    let waiter = ReconcilerWaiter {
        tenant_shard_id: self.tenant_shard_id,
        seq_wait: self.waiter.clone(),
        error_seq_wait: self.error_waiter.clone(),
        error: self.last_error.clone(),
        seq: self.sequence,
    };
    Some(waiter)
}
/// Called when a ReconcileResult has been emitted and the service is updating /// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress. /// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +829,6 @@ impl TenantShard {
debug_assert!(!self.intent.all_pageservers().contains(&node_id)); debug_assert!(!self.intent.all_pageservers().contains(&node_id));
} }
/// Replace this shard's scheduling policy with `p`.
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
self.scheduling_policy = p;
}
/// Borrow this shard's current scheduling policy.
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
&self.scheduling_policy
}
pub(crate) fn from_persistent(
tsp: TenantShardPersistence,
intent: IntentState,
) -> anyhow::Result<Self> {
let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
Ok(Self {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
})
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence { pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence { TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(), tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +840,6 @@ impl TenantShard {
placement_policy: serde_json::to_string(&self.policy).unwrap(), placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(), config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(), splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
} }
} }
} }
@@ -1143,7 +856,7 @@ pub(crate) mod tests {
use super::*; use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate(); let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0); let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1); let shard_count = ShardCount::new(1);
@@ -1153,7 +866,7 @@ pub(crate) mod tests {
shard_number, shard_number,
shard_count, shard_count,
}; };
TenantShard::new( TenantState::new(
tenant_shard_id, tenant_shard_id,
ShardIdentity::new( ShardIdentity::new(
shard_number, shard_number,
@@ -1165,32 +878,6 @@ pub(crate) mod tests {
) )
} }
/// Build every shard of a freshly generated test tenant.
///
/// Produces one `TenantShard` per shard number in `0..shard_count.count()`, all sharing
/// the same tenant id, a stripe size of 32768, and a clone of `policy`.
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
    let tenant_id = TenantId::generate();

    let mut shards = Vec::new();
    for index in 0..shard_count.count() {
        let shard_number = ShardNumber(index);
        let identity = ShardIdentity::new(
            shard_number,
            shard_count,
            pageserver_api::shard::ShardStripeSize(32768),
        )
        .unwrap();
        let tenant_shard_id = TenantShardId {
            tenant_id,
            shard_number,
            shard_count,
        };
        shards.push(TenantShard::new(tenant_shard_id, identity, policy.clone()));
    }
    shards
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject /// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline. /// to nodes being marked offline.
#[test] #[test]
@@ -1200,26 +887,25 @@ pub(crate) mod tests {
let mut nodes = make_test_nodes(3); let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values()); let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard tenant_state
.schedule(&mut scheduler, &mut context) .schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work"); .expect("we have enough nodes, scheduling should work");
// Expect to initially be schedule on to different nodes // Expect to initially be schedule on to different nodes
assert_eq!(tenant_shard.intent.secondary.len(), 1); assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_some()); assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_shard.intent.attached.unwrap(); let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap(); let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id); assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary // Notifying the attached node is offline should demote it to a secondary
let changed = tenant_shard.intent.demote_attached(attached_node_id); let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed); assert!(changed);
assert!(tenant_shard.intent.attached.is_none()); assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_shard.intent.secondary.len(), 2); assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline // Update the scheduler state to indicate the node is offline
nodes nodes
@@ -1229,18 +915,18 @@ pub(crate) mod tests {
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached // Scheduling the node should promote the still-available secondary node to attached
tenant_shard tenant_state
.schedule(&mut scheduler, &mut context) .schedule(&mut scheduler)
.expect("active nodes are available"); .expect("active nodes are available");
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id); assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary // The original attached node should have been retained as a secondary
assert_eq!( assert_eq!(
*tenant_shard.intent.secondary.iter().last().unwrap(), *tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id attached_node_id
); );
tenant_shard.intent.clear(&mut scheduler); tenant_state.intent.clear(&mut scheduler);
Ok(()) Ok(())
} }
@@ -1250,263 +936,48 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3); let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values()); let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard.observed.locations.insert( tenant_state.observed.locations.insert(
NodeId(3), NodeId(3),
ObservedStateLocation { ObservedStateLocation {
conf: Some(LocationConfig { conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedMulti, mode: LocationConfigMode::AttachedMulti,
generation: Some(2), generation: Some(2),
secondary_conf: None, secondary_conf: None,
shard_number: tenant_shard.shard.number.0, shard_number: tenant_state.shard.number.0,
shard_count: tenant_shard.shard.count.literal(), shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0, shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(), tenant_conf: TenantConfig::default(),
}), }),
}, },
); );
tenant_shard.observed.locations.insert( tenant_state.observed.locations.insert(
NodeId(2), NodeId(2),
ObservedStateLocation { ObservedStateLocation {
conf: Some(LocationConfig { conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedStale, mode: LocationConfigMode::AttachedStale,
generation: Some(1), generation: Some(1),
secondary_conf: None, secondary_conf: None,
shard_number: tenant_shard.shard.number.0, shard_number: tenant_state.shard.number.0,
shard_count: tenant_shard.shard.count.literal(), shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0, shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(), tenant_conf: TenantConfig::default(),
}), }),
}, },
); );
tenant_shard.intent_from_observed(&mut scheduler); tenant_state.intent_from_observed(&mut scheduler);
// The highest generationed attached location gets used as attached // The highest generationed attached location gets used as attached
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3))); assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
// Other locations get used as secondary // Other locations get used as secondary
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]); assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?; scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
/// Check that the shard's scheduling policy gates `schedule()`: under `Pause` the call
/// succeeds but leaves the intent empty, under `Active` it places the shard on nodes.
#[test]
fn scheduling_mode() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
// In pause mode, schedule() shouldn't do anything
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(tenant_shard.intent.all_pageservers().is_empty());
// In active mode, schedule() works
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(!tenant_shard.intent.all_pageservers().is_empty());
// Clear the intent against the scheduler before the shard goes out of scope.
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
/// Check that two shards attached to the same node are each offered a migration of
/// their attachment to their own secondary location, and that applying the proposed
/// optimizations swaps the attached and secondary roles as expected.
#[test]
fn optimize_attachment() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: both shards are attached on node 1, and they have their secondary
// locations on different nodes (node 2 for shard A, node 3 for shard B).
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
// Record both shards' locations and attachments in one shared context.
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
// Either shard should recognize that it has the option to switch to a secondary location where there
// would be no other shards from the same tenant, and request to do so.
assert_eq!(
optimization_a,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(2)
}))
);
// Note that optimizing two shards in the same tenant with the same ScheduleContext is
// mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
// of [`Service::optimize_all`] to avoid trying
// to do optimizations for multiple shards in the same tenant at the same time. Generating
// both optimizations is just done for test purposes.
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
assert_eq!(
optimization_b,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(3)
}))
);
// Applying these optimizations should result in the end state proposed
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
// Clear both intents against the scheduler before the shards go out of scope.
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
/// Check that a shard whose secondary shares a node with another shard's secondary is
/// offered a move of that secondary to a node holding no locations, and that applying
/// the optimization leaves the attachment untouched.
#[test]
fn optimize_secondary() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: shard A is attached on node 1 and shard B on node 2, and both have
// their secondary location on node 3. Node 4 holds nothing.
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
// Record both shards' locations and attachments in one shared context.
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
// Since there is a node with no locations available, the node with two locations for the
// same tenant should generate an optimization to move one away
assert_eq!(
optimization_a,
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: NodeId(3),
new_node_id: NodeId(4)
}))
);
// Applying the optimization moves only the secondary; the attachment stays on node 1.
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
// Clear both intents against the scheduler before the shards go out of scope.
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
// Repeatedly optimize until no shard has anything left to do: this emulates what
// Service::optimize_all does when it is called repeatedly in the background.
//
// Each pass rebuilds the ScheduleContext from the shards' current intents and applies
// at most one optimization, preferring an attachment migration over a secondary
// replacement for any given shard.
fn optimize_til_idle(
    nodes: &HashMap<NodeId, Node>,
    scheduler: &mut Scheduler,
    shards: &mut [TenantShard],
) {
    let mut passes = 0;
    loop {
        // Rebuild the context from scratch, since the previous pass may have changed intents.
        let mut schedule_context = ScheduleContext::default();
        for shard in shards.iter() {
            schedule_context.avoid(&shard.intent.all_pageservers());
            if let Some(attached) = shard.intent.get_attached() {
                schedule_context.push_attached(*attached);
            }
        }

        let mut any_changed = false;
        for shard in shards.iter_mut() {
            // Only consult the secondary optimizer when no attachment move was found.
            let proposal = shard
                .optimize_attachment(nodes, &schedule_context)
                .or_else(|| shard.optimize_secondary(scheduler, &schedule_context));

            if let Some(optimization) = proposal {
                shard.apply_optimization(scheduler, optimization);
                any_changed = true;
                break;
            }
        }

        if !any_changed {
            return;
        }

        // Assert no infinite loop
        passes += 1;
        assert!(passes < 1000);
    }
}
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
/// that it converges.
#[test]
fn optimize_add_nodes() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
// Only show the scheduler a couple of nodes
let mut scheduler = Scheduler::new([].iter());
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
let mut schedule_context = ScheduleContext::default();
for shard in &mut shards {
assert!(shard
.schedule(&mut scheduler, &mut schedule_context)
.is_ok());
}
// We should see equal number of locations on the two nodes.
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
// Add another two nodes: we should see the shards spread out when their optimize
// methods are called
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
for shard in shards.iter_mut() {
shard.intent.clear(&mut scheduler);
}
tenant_state.intent.clear(&mut scheduler);
Ok(()) Ok(())
} }
} }

View File

@@ -86,10 +86,7 @@ where
.stdout(process_log_file) .stdout(process_log_file)
.stderr(same_file_for_stderr) .stderr(same_file_for_stderr)
.args(args); .args(args);
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
filled_cmd.envs(envs); filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file { let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd cmd
} }
/// Forward every environment variable of the current process whose name starts with
/// `NEON_PAGESERVER_` to `cmd`, and return `cmd` for chaining.
///
/// The environment is read with `std::env::vars_os`, so unrelated variables that are
/// not valid Unicode cannot make this function panic (iterating `std::env::vars`
/// panics on such entries). Matching values are passed through unmodified.
fn fill_env_vars_prefixed_neon(cmd: &mut Command) -> &mut Command {
    cmd.envs(std::env::vars_os().filter(|(var, _)| {
        // The prefix is plain ASCII, so a lossy conversion cannot alter it; it is only
        // used for the comparison, while the original key is what gets forwarded.
        var.to_string_lossy().starts_with("NEON_PAGESERVER_")
    }))
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and /// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock) /// 2. Sets up the pidfile's file descriptor so that it (and the lock)

View File

@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode; use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController; use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env}; use control_plane::{broker, local_env};
use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
};
use pageserver_api::models::{ use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
}; };
@@ -1058,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
} }
} }
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => { Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await { match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"), Ok(_) => println!("Page server is up and running"),
@@ -1231,7 +1248,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
match ComputeControlPlane::load(env.clone()) { match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => { Ok(cplane) => {
for (_k, node) in cplane.endpoints { for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
eprintln!("postgres stop failed: {e:#}"); eprintln!("postgres stop failed: {e:#}");
} }
} }
@@ -1417,7 +1434,6 @@ fn cli() -> Command {
.subcommand( .subcommand(
Command::new("timeline") Command::new("timeline")
.about("Manage timelines") .about("Manage timelines")
.arg_required_else_help(true)
.subcommand(Command::new("list") .subcommand(Command::new("list")
.about("List all timelines, available to this pageserver") .about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone())) .arg(tenant_id_arg.clone()))
@@ -1499,6 +1515,12 @@ fn cli() -> Command {
.about("Restart local pageserver") .about("Restart local pageserver")
.arg(pageserver_config_args.clone()) .arg(pageserver_config_args.clone())
) )
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
) )
.subcommand( .subcommand(
Command::new("storage_controller") Command::new("storage_controller")

View File

@@ -156,7 +156,6 @@ pub struct SafekeeperConf {
pub remote_storage: Option<String>, pub remote_storage: Option<String>,
pub backup_threads: Option<u32>, pub backup_threads: Option<u32>,
pub auth_enabled: bool, pub auth_enabled: bool,
pub listen_addr: Option<String>,
} }
impl Default for SafekeeperConf { impl Default for SafekeeperConf {
@@ -170,7 +169,6 @@ impl Default for SafekeeperConf {
remote_storage: None, remote_storage: None,
backup_threads: None, backup_threads: None,
auth_enabled: false, auth_enabled: false,
listen_addr: None,
} }
} }
} }

View File

@@ -389,10 +389,6 @@ impl PageServerNode {
.remove("image_creation_threshold") .remove("image_creation_threshold")
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose()?, .transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout") .remove("walreceiver_connect_timeout")
@@ -505,12 +501,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>()) .map(|x| x.parse::<usize>())
.transpose() .transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?, .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout") .remove("walreceiver_connect_timeout")

View File

@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig, pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv, pub env: LocalEnv,
pub http_client: reqwest::Client, pub http_client: reqwest::Client,
pub listen_addr: String,
pub http_base_url: String, pub http_base_url: String,
} }
impl SafekeeperNode { impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
listen_addr.clone()
} else {
"127.0.0.1".to_string()
};
SafekeeperNode { SafekeeperNode {
id: conf.id, id: conf.id,
conf: conf.clone(), conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(), env: env.clone(),
http_client: reqwest::Client::new(), http_client: reqwest::Client::new(),
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
listen_addr,
} }
} }
/// Construct libpq connection string for connecting to this safekeeper. /// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
} }
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
); );
io::stdout().flush().unwrap(); io::stdout().flush().unwrap();
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id; let id = self.id;
let datadir = self.datadir_path(); let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
availability_zone, availability_zone,
]; ];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
} }
if !self.conf.sync { if !self.conf.sync {

View File

@@ -1,23 +0,0 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -1,681 +0,0 @@
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
// Subcommands accepted by the CLI; `main()` has one match arm per variant.
//
// NOTE: the `///` doc comments inside this enum are picked up by clap's derive macro and
// shown as user-facing help text, so editing them changes the `--help` output. Maintainer
// notes are therefore written as plain `//` comments.
#[derive(Subcommand, Debug)]
enum Command {
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
/// since pageservers auto-register when they start up
NodeRegister {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
listen_pg_addr: String,
#[arg(long)]
listen_pg_port: u16,
#[arg(long)]
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
},
/// Modify a node's configuration in the storage controller
NodeConfigure {
#[arg(long)]
node_id: NodeId,
// Parsed by `NodeAvailabilityArg::from_str`, which accepts `active` and `offline`.
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
/// manually mark a node offline
#[arg(long)]
availability: Option<NodeAvailabilityArg>,
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
tenant_id: TenantId,
// Parsed by `PlacementPolicyArg::from_str`.
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
/// or is in the normal attached state with N secondary locations (`attached:N`)
#[arg(long)]
placement: Option<PlacementPolicyArg>,
// Parsed by `ShardSchedulingPolicyArg::from_str`.
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
/// unavailable, and are only for use in emergencies.
#[arg(long)]
scheduling: Option<ShardSchedulingPolicyArg>,
},
/// List nodes known to the storage controller
Nodes {},
/// List tenants known to the storage controller
Tenants {},
// `main()` always creates the tenant unsharded, with default config and `Attached(1)` placement.
/// Create a new tenant in the storage controller, and by extension on pageservers.
TenantCreate {
#[arg(long)]
tenant_id: TenantId,
},
/// Delete a tenant in the storage controller, and by extension on pageservers.
TenantDelete {
#[arg(long)]
tenant_id: TenantId,
},
/// Split an existing tenant into a higher number of shards than its current shard count.
TenantShardSplit {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
shard_count: u8,
// Wrapped into `ShardStripeSize` when `main()` builds the split request.
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
#[arg(long)]
stripe_size: Option<u32>,
},
/// Migrate the attached location for a tenant shard to a specific pageserver.
TenantShardMigrate {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
#[arg(long)]
tenant_id: TenantId,
// Raw JSON text; `main()` deserializes it with `serde_json::from_str`.
#[arg(long)]
config: String,
},
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
/// alternative to the storage controller's scheduling optimization behavior.
TenantScatter {
#[arg(long)]
tenant_id: TenantId,
},
/// Print details about a particular tenant, including all its shards' states.
TenantDescribe {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
}
// Top-level command line arguments: the controller endpoint, an optional JWT and the
// subcommand to run.
//
// NOTE: the `///` doc comments on the fields are consumed by clap's derive macro as help
// text, so maintainer notes here are plain `//` comments.
#[derive(Parser)]
#[command(
author,
version,
about,
long_about = "CLI for Storage Controller Support/Debug"
)]
// Running the binary without any arguments prints the help text.
#[command(arg_required_else_help(true))]
struct Cli {
// `main()` builds both HTTP clients (`Client` and `mgmt_api::Client`) from this URL.
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
api: Url,
// When present, `Client::dispatch` sends it as an `Authorization: Bearer ...` header.
#[arg(long)]
/// JWT token for authenticating with storage controller. Depending on the API used, this
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[command(subcommand)]
command: Command,
}
/// Command-line wrapper around [`PlacementPolicy`], parsed from `detached`, `secondary`
/// or `attached:<n>`.
#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);

impl FromStr for PlacementPolicyArg {
    type Err = anyhow::Error;

    /// Parses a placement policy argument.
    ///
    /// # Errors
    /// Returns an error if `s` is not `detached`, `secondary` or `attached:<n>` where
    /// `<n>` is the entire remainder of the string and parses as a `usize`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "detached" => Ok(Self(PlacementPolicy::Detached)),
            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
            _ => match s.strip_prefix("attached:") {
                // Parse everything after the prefix, so that trailing junk such as
                // `attached:1:2` is rejected instead of being silently read as `attached:1`.
                Some(count) => count
                    .parse::<usize>()
                    .map(|n| Self(PlacementPolicy::Attached(n)))
                    .map_err(|_| {
                        anyhow::anyhow!("Invalid format '{s}', a valid example is 'attached:1'")
                    }),
                None => Err(anyhow::anyhow!(
                    "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
                )),
            },
        }
    }
}
/// Command-line wrapper around [`ShardSchedulingPolicy`], parsed from the lowercase
/// policy names `active`, `essential`, `pause` and `stop`.
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);

impl FromStr for ShardSchedulingPolicyArg {
    type Err = anyhow::Error;

    /// Maps a policy name onto the matching [`ShardSchedulingPolicy`] variant; any other
    /// input is reported as an error that lists the accepted names.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let policy = match s {
            "active" => ShardSchedulingPolicy::Active,
            "essential" => ShardSchedulingPolicy::Essential,
            "pause" => ShardSchedulingPolicy::Pause,
            "stop" => ShardSchedulingPolicy::Stop,
            _ => anyhow::bail!("Unknown scheduling policy '{s}', try active,essential,pause,stop"),
        };
        Ok(Self(policy))
    }
}
/// Command-line wrapper around [`NodeAvailabilityWrapper`], parsed from `active` or
/// `offline`.
#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);

impl FromStr for NodeAvailabilityArg {
    type Err = anyhow::Error;

    /// Parses a node availability argument.
    ///
    /// # Errors
    /// Returns an error listing the accepted values if `s` is neither `active` nor
    /// `offline`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
            // Spell out the accepted values, like the other argument parsers in this file.
            _ => Err(anyhow::anyhow!(
                "Unknown availability state '{s}', try active,offline"
            )),
        }
    }
}
/// Minimal HTTP client for calling the storage controller's own API.
struct Client {
    /// Endpoint of the storage controller; its path, query and fragment are replaced on
    /// every request.
    base_url: Url,
    /// Optional token sent as an `Authorization: Bearer ...` header.
    jwt_token: Option<String>,
    client: reqwest::Client,
}

impl Client {
    /// Builds a client for the given endpoint.
    ///
    /// # Panics
    /// Panics if the underlying `reqwest` client cannot be constructed.
    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
        Self {
            base_url,
            jwt_token,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
    }

    /// Simple HTTP request wrapper for calling into storage controller
    ///
    /// Sends `method` to `path` (a plain path such as `control/v1/node`, without a query
    /// string) on the configured endpoint, with `body` serialized as JSON when present,
    /// and deserializes the JSON response into `RS`.
    ///
    /// # Errors
    /// Returns an error if the request cannot be sent, if the server answers with an
    /// error status, or if the response body cannot be deserialized.
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
        // for general purpose API access.
        //
        // Derive the request URL from the configured one instead of re-assembling it from
        // host and port: `Url::port()` is `None` when the port is omitted or is the scheme's
        // default, which used to panic, and the scheme (e.g. https) must be preserved.
        let mut url = self.base_url.clone();
        url.set_path(&path);
        url.set_query(None);
        url.set_fragment(None);

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body);
        }
        if let Some(jwt_token) = &self.jwt_token {
            builder = builder.header(
                reqwest::header::AUTHORIZATION,
                format!("Bearer {jwt_token}"),
            );
        }

        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
        // Turn non-success statuses into errors before trying to decode the body.
        let response = response.error_from_body().await?;

        response
            .json()
            .await
            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
    }
}
/// Entry point: parses the command line, builds the HTTP clients and runs the selected
/// subcommand against the storage controller.
///
/// # Errors
/// Returns any error produced by the HTTP calls, by parsing `--config` as JSON, or by an
/// unexpected controller response. Nothing is retried.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());

    // `Url` serializes a bare `http://host:port` with a trailing `/`. Strip trailing slashes,
    // and only slashes: an unconditional `pop()` would eat the last character of a URL that
    // does not end in `/`.
    let trimmed = cli.api.as_str().trim_end_matches('/').to_string();
    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());

    match cli.command {
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
                    }),
                )
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
            vps_client
                .tenant_create(&TenantCreateRequest {
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters::default(),
                    placement_policy: Some(PlacementPolicy::Attached(1)),
                    config: TenantConfig::default(),
                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
            let status = vps_client
                .tenant_delete(TenantShardId::unsharded(tenant_id))
                .await?;
            // Print directly: this binary never installs a tracing subscriber, so a
            // `tracing::info!` event would not be shown to the user.
            println!("Delete status: {}", status);
        }
        Command::Nodes {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    node.id.to_string(),
                    node.listen_http_addr,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
            }
            println!("{table}");
        }
        Command::NodeConfigure {
            node_id,
            availability,
            scheduling,
        } => {
            let req = NodeConfigureRequest {
                node_id,
                availability: availability.map(|a| a.0),
                scheduling,
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/node/{node_id}/config"),
                    Some(req),
                )
                .await?;
        }
        Command::Tenants {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
            for tenant in resp {
                // Shard count and scheduling policy are read from the tenant's first shard;
                // report a tenant without shards as an error instead of panicking.
                let Some(shard_zero) = tenant.shards.into_iter().next() else {
                    anyhow::bail!("Tenant {} has no shards", tenant.tenant_id);
                };
                table.add_row([
                    tenant.tenant_id.to_string(),
                    shard_zero.tenant_shard_id.shard_count.literal().to_string(),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
                    format!("{:?}", shard_zero.scheduling_policy),
                ]);
            }
            println!("{table}");
        }
        Command::TenantPolicy {
            tenant_id,
            placement,
            scheduling,
        } => {
            let req = TenantPolicyRequest {
                scheduling: scheduling.map(|s| s.0),
                placement: placement.map(|p| p.0),
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/policy"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantShardSplit {
            tenant_id,
            shard_count,
            stripe_size,
        } => {
            let req = TenantShardSplitRequest {
                new_shard_count: shard_count,
                new_stripe_size: stripe_size.map(ShardStripeSize),
            };
            let response = storcon_client
                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/shard_split"),
                    Some(req),
                )
                .await?;
            println!(
                "Split tenant {} into {} shards: {}",
                tenant_id,
                shard_count,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Command::TenantShardMigrate {
            tenant_shard_id,
            node,
        } => {
            let req = TenantShardMigrateRequest {
                tenant_shard_id,
                node_id: node,
            };
            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;
            vps_client
                .tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: tenant_conf,
                })
                .await?;
        }
        Command::TenantScatter { tenant_id } => {
            // Find the shards
            let locate_response = storcon_client
                .dispatch::<(), TenantLocateResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}/locate"),
                    None,
                )
                .await?;
            let shards = locate_response.shards;

            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
            let shard_count = shards.len();
            for s in shards {
                node_to_shards.entry(s.node_id).or_default().push(s.shard_id);
            }

            // Load list of available nodes
            let nodes_resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            for node in nodes_resp {
                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
                    // Active nodes that hold no shard yet are candidate destinations.
                    node_to_shards.entry(node.id).or_default();
                }
            }

            // Without any node the division below would panic.
            if node_to_shards.is_empty() {
                anyhow::bail!("No nodes available to scatter tenant {tenant_id} across");
            }
            // Integer division rounds down.
            let max_shard_per_node = shard_count / node_to_shards.len();

            // Spread the shards across the nodes
            loop {
                // Take exactly one shard per iteration, from the most loaded node that is
                // over the limit. Popping from every overloaded node while migrating only
                // one shard would make the local bookkeeping diverge from reality.
                let Some(migrate_shard) = node_to_shards
                    .values_mut()
                    .filter(|shards| shards.len() > max_shard_per_node)
                    .max_by_key(|shards| shards.len())
                    .and_then(|shards| shards.pop())
                else {
                    break;
                };

                // Pick the emptiest node to migrate to
                let Some((destination_node, destination_count)) = node_to_shards
                    .iter()
                    .map(|(node_id, shards)| (*node_id, shards.len()))
                    .min_by_key(|(_, count)| *count)
                else {
                    break;
                };

                if destination_count + 1 > max_shard_per_node {
                    // Even the emptiest destination doesn't have space: we're done
                    break;
                }

                node_to_shards
                    .entry(destination_node)
                    .or_default()
                    .push(migrate_shard);

                println!("Migrate {} -> {} ...", migrate_shard, destination_node);

                storcon_client
                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                        Method::PUT,
                        format!("control/v1/tenant/{migrate_shard}/migrate"),
                        Some(TenantShardMigrateRequest {
                            tenant_shard_id: migrate_shard,
                            node_id: destination_node,
                        }),
                    )
                    .await?;
                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
            }
        }
        Command::TenantDescribe { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let shards = describe_response.shards;
            let mut table = comfy_table::Table::new();
            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
                    .iter()
                    .map(|n| n.to_string())
                    .collect::<Vec<_>>()
                    .join(",");

                let mut status_parts = Vec::new();
                if shard.is_reconciling {
                    status_parts.push("reconciling");
                }
                if shard.is_pending_compute_notification {
                    status_parts.push("pending_compute");
                }
                if shard.is_splitting {
                    status_parts.push("splitting");
                }
                let status = status_parts.join(",");

                table.add_row([
                    shard.tenant_shard_id.to_string(),
                    shard
                        .node_attached
                        .map(|n| n.to_string())
                        .unwrap_or_default(),
                    secondary,
                    shard.last_error,
                    status,
                ]);
            }
            println!("{table}");
        }
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await;
            match describe_response {
                Ok(describe) => {
                    if matches!(describe.policy, PlacementPolicy::Secondary) {
                        // Fine: it's already known to controller in secondary mode: calling
                        // again to put it into secondary mode won't cause problems.
                    } else {
                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
                    }
                }
                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
                    // Fine: this tenant isn't known to the storage controller yet.
                }
                Err(e) => {
                    // Unexpected API error
                    return Err(e.into());
                }
            }

            vps_client
                .location_config(
                    TenantShardId::unsharded(tenant_id),
                    pageserver_api::models::LocationConfig {
                        mode: pageserver_api::models::LocationConfigMode::Secondary,
                        generation: None,
                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
                        shard_number: 0,
                        shard_count: 0,
                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
                        tenant_conf: TenantConfig::default(),
                    },
                    None,
                    true,
                )
                .await?;

            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;

            // Report a missing shard or secondary location as an error rather than panicking.
            let secondary_ps_id = describe_response
                .shards
                .first()
                .and_then(|shard| shard.node_secondary.first())
                .ok_or_else(|| {
                    anyhow::anyhow!("Tenant {tenant_id} has no secondary location to warm up")
                })?;

            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
            loop {
                let (status, progress) = vps_client
                    .tenant_secondary_download(
                        TenantShardId::unsharded(tenant_id),
                        Some(Duration::from_secs(10)),
                    )
                    .await?;
                println!(
                    "Progress: {}/{} layers, {}/{} bytes",
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                match status {
                    StatusCode::OK => {
                        println!("Download complete");
                        break;
                    }
                    StatusCode::ACCEPTED => {
                        // Download still in progress: ask again.
                    }
                    _ => {
                        anyhow::bail!("Unexpected download status: {status}");
                    }
                }
            }
        }
    }

    Ok(())
}

View File

@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli # see https://diesel.rs/guides/configuring-diesel-cli
[print_schema] [print_schema]
file = "storage_controller/src/schema.rs" file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"] custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory] [migrations_directory]
dir = "storage_controller/migrations" dir = "control_plane/attachment_service/migrations"

View File

@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers. Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md) [storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`: `/control_plane`:
Local control plane. Local control plane.

View File

@@ -1,150 +0,0 @@
# Storage Controller
## Concepts
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
## APIs
The storage controller's HTTP server implements four logically separate APIs:
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller's API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
to ensure data safety with generation numbers.
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers' APIs).
See the `http.rs` file in the source for where the HTTP APIs are implemented.
## Database
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.
### Diesel tip: migrations
If you need to modify the database schema, here's how to create a migration:
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.
## storcon_cli
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
`storcon_cli --help` includes details on commands.
# Deploying
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._
## Database
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
Set the URL to the database using the `--database-url` CLI option.
There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.
## Configure pageservers to use the storage controller
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
with the storage controller when it starts up. See the example below for the format of this file.
### Example `metadata.json`
```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
the storage controller runs.
## Handle compute notifications.
The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
shards identified by `NodeId` must be converted to the address+port of the node.
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds.
### Example notification body
```
{
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
"stripe_size": 32768,
"shards": [
{"node_id": 344, "shard_number": 0},
{"node_id": 722, "shard_number": 1}
]
}
```

View File

@@ -10,13 +10,11 @@ libc.workspace = true
once_cell.workspace = true once_cell.workspace = true
chrono.workspace = true chrono.workspace = true
twox-hash.workspace = true twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies] [target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true procfs.workspace = true
measured-process.workspace = true
[dev-dependencies] [dev-dependencies]
rand = "0.8" rand = "0.8"

View File

@@ -7,19 +7,14 @@
//! use significantly less memory than this, but can only approximate the cardinality. //! use significantly less memory than this, but can only approximate the cardinality.
use std::{ use std::{
hash::{BuildHasher, BuildHasherDefault, Hash}, collections::HashMap,
sync::atomic::AtomicU8, hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
}; };
use measured::{ use prometheus::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, core::{self, Describer},
metric::{ proto, Opts,
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
}; };
use twox_hash::xxh3; use twox_hash::xxh3;
@@ -98,25 +93,203 @@ macro_rules! register_hll {
/// ``` /// ```
/// ///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>; #[derive(Clone)]
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>; pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
pub struct HyperLogLogState<const N: usize> {
shards: [AtomicU8; N],
} }
impl<const N: usize> Default for HyperLogLogState<N> {
fn default() -> Self { struct HyperLogLogVecCore<const N: usize> {
#[allow(clippy::declare_interior_mutable_const)] pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
const ZERO: AtomicU8 = AtomicU8::new(0); pub desc: core::Desc,
Self { shards: [ZERO; N] } pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
} }
} }
impl<const N: usize> MetricType for HyperLogLogState<N> { impl<const N: usize> HyperLogLogVec<N> {
type Metadata = (); /// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
} }
impl<const N: usize> HyperLogLogState<N> { impl<const N: usize> HyperLogLogVecCore<N> {
/// Returns the metric registered for `vals`, creating it on first use.
///
/// Fails if `hash_label_values` rejects `vals` or if creating a new metric fails.
pub fn get_metric_with_label_values(
    &self,
    vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
    let key = self.hash_label_values(vals)?;

    // Look the child up under the read lock. The guard is a temporary of this statement,
    // so it is released before the slow path below takes the write lock.
    let existing = self.children.read().unwrap().get(&key).cloned();

    match existing {
        Some(metric) => Ok(metric),
        None => self.get_or_create_metric(key, vals),
    }
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) { pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements. // changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item)); self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
let p = N.ilog2() as u8; let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1); let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p; let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
/// State shared by every clone of a [`HyperLogLog`] handle.
struct HyperLogLogCore<const N: usize> {
/// One register per shard. Recording raises a register with `fetch_max`;
/// collecting swaps each register back to 0 and exports the old value.
shards: [AtomicU8; N],
/// Descriptor supplying the metric family's name and help text.
desc: core::Desc,
/// Label pairs copied onto every exported shard sample, ahead of the
/// per-shard `hll_shard` label.
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
} }
fn take_sample(&self) -> [u8; N] { fn collect(&self) -> Vec<proto::MetricFamily> {
self.shards.each_ref().map(|x| { let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition, // This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling. // this would mean that a dev port-forwarding the metrics url won't break the sampling.
x.swap(0, std::sync::atomic::Ordering::Relaxed) let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
}) })
} }
} }
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N> fn make_label_pairs(
{ desc: &core::Desc,
fn write_type( label_values: &[&str],
name: impl MetricNameEncoder, ) -> prometheus::Result<Vec<proto::LabelPair>> {
enc: &mut TextEncoder<W>, if desc.variable_labels.len() != label_values.len() {
) -> Result<(), std::io::Error> { return Err(prometheus::Error::InconsistentCardinality {
enc.write_type(&name, measured::text::MetricType::Gauge) expect: desc.variable_labels.len(),
got: label_values.len(),
});
} }
fn collect_into(
&self,
_: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
struct I64(i64);
impl LabelValue for I64 {
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0)
}
}
struct HllShardLabel { let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
hll_shard: i64, if total_len == 0 {
} return Ok(vec![]);
impl LabelGroup for HllShardLabel {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const LE: &LabelName = LabelName::from_str("hll_shard");
v.write_value(LE, &I64(self.hll_shard));
}
}
self.take_sample()
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
MetricValue::Int(val as i64),
)
})
} }
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::collections::HashSet; use std::collections::HashSet;
use measured::{label::StaticLabelSet, FixedCardinalityLabel}; use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng}; use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf}; use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec; use crate::HyperLogLogVec;
#[derive(FixedCardinalityLabel, Clone, Copy)] fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
#[label(singleton = "x")] let mut metrics = vec![];
enum Label { hll.core
A, .children
B, .read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
} }
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
// cannot go through the `hll.collect_family_into` interface yet...
// need to see if I can fix the conflicting impls problem in measured.
(
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
)
}
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
let mut buckets = [0.0; 32]; let mut buckets = [0.0; 32];
for &sample in samples { for metric in metrics.chunks_exact(32) {
for (i, m) in sample.into_iter().enumerate() { if filter(&metric[0]) {
buckets[i] = f64::max(buckets[i], m as f64); for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
} }
} }
@@ -238,7 +437,7 @@ mod tests {
} }
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) { fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new(); let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new(); let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {
for x in iter.by_ref().take(n) { for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits()); set_a.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::A)) hll.with_label_values(&["a"]).measure(&x.to_bits());
.measure(&x.to_bits());
} }
for x in iter.by_ref().take(n) { for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits()); set_b.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::B)) hll.with_label_values(&["b"]).measure(&x.to_bits());
.measure(&x.to_bits());
} }
let merge = &set_a | &set_b; let merge = &set_a | &set_b;
let (a, b) = collect(&hll); let metrics = collect(&hll);
let len = get_cardinality(&[a, b]); let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&[a]); let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&[b]); let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
} }

View File

@@ -4,17 +4,6 @@
//! a default registry. //! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)] #![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use prometheus::core::{ use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
pub use prometheus::opts; pub use prometheus::opts;
pub use prometheus::register; pub use prometheus::register;
pub use prometheus::Error; pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto}; pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder}; pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp; pub mod launch_timestamp;
mod wrappers; mod wrappers;
pub use wrappers::{CountedReader, CountedWriter}; pub use wrappers::{CountedReader, CountedWriter};
mod hll; mod hll;
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; pub use hll::{HyperLogLog, HyperLogLogVec};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
pub mod more_process_metrics; pub mod more_process_metrics;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock. /// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> { pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c) INTERNAL_REGISTRY.register(c)
} }
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
]; ];
/// Build/version information, exported as the `revision` and `build_tag` labels.
pub struct BuildInfo {
/// Value written for the `revision` label.
pub revision: &'static str,
/// Value written for the `build_tag` label.
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
    /// Writes the `revision` label followed by the `build_tag` label.
    fn visit_values(&self, visitor: &mut impl LabelGroupVisitor) {
        const REVISION: &LabelName = LabelName::from_str("revision");
        const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");

        visitor.write_value(REVISION, &self.revision);
        visitor.write_value(BUILD_TAG, &self.build_tag);
    }
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
    GaugeState: MetricEncoding<T>,
{
    /// Encodes the build info as a gauge family with a single sample of value 1,
    /// labelled with this struct's `revision` and `build_tag`.
    ///
    /// # Errors
    ///
    /// Returns the encoder's error as soon as any write fails.
    fn collect_family_into(
        &self,
        name: impl measured::metric::name::MetricNameEncoder,
        enc: &mut T,
    ) -> Result<(), T::Err> {
        enc.write_help(&name, "Build/version information")?;
        GaugeState::write_type(&name, enc)?;

        // The value is a constant; the information lives in the labels.
        let constant_one = GaugeState {
            count: std::sync::atomic::AtomicI64::new(1),
        };
        constant_one.collect_into(&(), self, name, enc)
    }
}
// Top-level metric group: process metrics (Linux only) plus the `libmetrics` group.
// Plain `//` comments are used on purpose so the derive macro's input is unchanged.
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
// Only compiled on Linux; initialised for the current process under the
// `process` namespace.
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
// Library-level metrics under the `libmetrics` namespace, built from `build_info`.
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
// Metrics owned by this library: build info, resource usage and a scrape counter.
// Plain `//` comments are used on purpose so the derive macro's input is unchanged.
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
// Initialised from the `build_info` constructor argument.
#[metric(init = build_info)]
build_info: BuildInfo,
// Flattened into this group.
#[metric(flatten)]
rusage: Rusage,
// Incremented each time it is collected.
serve_count: CollectionCounter,
}
/// Writes a single integer sample `x` for metric `name` with the given `labels`.
///
/// # Errors
///
/// Returns the encoder's error if the write fails.
fn write_gauge<Enc: Encoding>(
    x: i64,
    labels: impl LabelGroup,
    name: impl MetricNameEncoder,
    enc: &mut Enc,
) -> Result<(), Enc::Err> {
    let sample = MetricValue::Int(x);
    enc.write_metric_value(name, labels, sample)
}
// Stateless metric group: its values are read from `get_rusage_stats()` at
// collection time rather than stored.
#[derive(Default)]
struct Rusage;
// Value of the `io_operation` label on the disk I/O gauge.
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
// Labels the sample computed from `ru_inblock`.
Read,
// Labels the sample computed from `ru_oublock`.
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
// Counter that is incremented every time it is collected, i.e. it counts how
// often the metrics have been requested.
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) { pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!( let metric = register_int_gauge_vec!(
"libmetrics_build_info", "libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric"); .expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1); metric.with_label_values(&[revision, build_tag]).set(1);
} }
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way. // Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() { fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats(); let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES DISK_IO_BYTES
.with_label_values(&["read"]) .with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
} }
}}; }};
} }
/// Create an [`IntCounterPair`] and registers to default registry. /// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)] #[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair { macro_rules! register_int_counter_pair {
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
/// ///
/// An error is returned if the number of label values is not the same as the /// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc. /// number of VariableLabels in Desc.
pub fn get_metric_with_label_values( pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
Ok(GenericCounterPair { Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?, inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?,
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap() self.get_metric_with_label_values(vals).unwrap()
} }
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals); res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals); res[1] = self.dec.remove_label_values(vals);
} }
@@ -421,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>; pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
/// Static description of a counter pair: the names and help texts of its
/// increment and decrement counters, and the label set both are keyed by.
pub trait CounterPairAssoc {
/// Metric name of the increment counter.
const INC_NAME: &'static MetricName;
/// Metric name of the decrement counter.
const DEC_NAME: &'static MetricName;
/// Help text written for the increment counter.
const INC_HELP: &'static str;
/// Help text written for the decrement counter.
const DEC_HELP: &'static str;

/// Label set used to key the pair.
type LabelGroupSet: LabelGroupSet;
}
/// A labelled collection of inc/dec counter pairs, described by `A`.
pub struct CounterPairVec<A: CounterPairAssoc> {
/// One [`MeasuredCounterPairState`] per label combination.
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
where
    A::LabelGroupSet: Default,
{
    /// Creates the vec from the default value of the underlying metric vec.
    fn default() -> Self {
        let vec = Default::default();
        Self { vec }
    }
}
impl<A: CounterPairAssoc> CounterPairVec<A> {
    /// Increments the `inc` counter for `labels` and returns a guard that
    /// increments the matching `dec` counter when it is dropped.
    pub fn guard(
        &self,
        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
    ) -> MeasuredCounterPairGuard<'_, A> {
        let label_id = self.vec.with_labels(labels);
        self.vec.get_metric(label_id).inc.inc();
        MeasuredCounterPairGuard {
            vec: &self.vec,
            id: label_id,
        }
    }

    /// Increments the `inc` counter for `labels`.
    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
        let label_id = self.vec.with_labels(labels);
        let pair = self.vec.get_metric(label_id);
        pair.inc.inc();
    }

    /// Increments the `dec` counter for `labels`.
    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
        let label_id = self.vec.with_labels(labels);
        let pair = self.vec.get_metric(label_id);
        pair.dec.inc();
    }

    /// Removes the pair stored for `labels`, returning whatever the underlying
    /// vec's `remove_metric` returns for it.
    pub fn remove_metric(
        &self,
        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
    ) -> Option<MeasuredCounterPairState> {
        let label_id = self.vec.with_labels(labels);
        self.vec.remove_metric(label_id)
    }
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
where
    T: ::measured::metric::group::Encoding,
    A: CounterPairAssoc,
    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
{
    /// Encodes the pair as two counter families: the decrement family first,
    /// then the increment family.
    ///
    /// # Errors
    ///
    /// Returns the encoder's error as soon as any write fails.
    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
        // write decrement first to avoid a race condition where inc - dec < 0
        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
        let mut dec_only = Dec(&mut *enc);
        self.vec.collect_family_into(A::DEC_NAME, &mut dec_only)?;

        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
        let mut inc_only = Inc(&mut *enc);
        self.vec.collect_family_into(A::INC_NAME, &mut inc_only)?;

        Ok(())
    }
}
// State of one counter pair. Plain `//` comments are used on purpose so the
// derive macro's input is unchanged.
#[derive(MetricGroup, Default)]
pub struct MeasuredCounterPairState {
// Counter of increments.
pub inc: CounterState,
// Counter of decrements.
pub dec: CounterState,
}
impl measured::metric::MetricType for MeasuredCounterPairState {
/// The pair carries no extra per-metric metadata.
type Metadata = ();
}
/// Guard returned by [`CounterPairVec::guard`]; dropping it increments the `dec`
/// counter of the pair it was created for.
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
/// The metric vec that holds the pair.
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
/// Identifies the pair inside `vec`.
id: measured::metric::LabelId<A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
    /// Records the decrement that matches the increment made when the guard was
    /// created.
    fn drop(&mut self) {
        let pair = self.vec.get_metric(self.id);
        pair.dec.inc();
    }
}
/// Encoder wrapper that selects the [`MetricEncoding`] impl of
/// [`MeasuredCounterPairState`] which only writes the inc counter to the inner encoder.
struct Inc<T>(T);
/// Encoder wrapper that selects the [`MetricEncoding`] impl of
/// [`MeasuredCounterPairState`] which only writes the dec counter to the inner encoder.
struct Dec<T>(T);
impl<T: Encoding> Encoding for Inc<T> {
    type Err = T::Err;

    /// Forwards the help text to the wrapped encoder unchanged.
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        let Self(inner) = self;
        inner.write_help(name, help)
    }

    /// Forwards the sample to the wrapped encoder unchanged.
    fn write_metric_value(
        &mut self,
        name: impl MetricNameEncoder,
        labels: impl LabelGroup,
        value: MetricValue,
    ) -> Result<(), Self::Err> {
        let Self(inner) = self;
        inner.write_metric_value(name, labels, value)
    }
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Inc<T>,
) -> Result<(), T::Err> {
self.inc.collect_into(metadata, labels, name, &mut enc.0)
}
}
impl<T: Encoding> Encoding for Dec<T> {
    type Err = T::Err;

    /// Forwards the help text to the wrapped encoder unchanged.
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        let Self(inner) = self;
        inner.write_help(name, help)
    }

    /// Forwards the sample to the wrapped encoder unchanged.
    fn write_metric_value(
        &mut self,
        name: impl MetricNameEncoder,
        labels: impl LabelGroup,
        value: MetricValue,
    ) -> Result<(), Self::Err> {
        let Self(inner) = self;
        inner.write_metric_value(name, labels, value)
    }
}
/// Write the dec counter to the encoder
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Dec<T>,
) -> Result<(), T::Err> {
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}

View File

@@ -2,9 +2,9 @@ use std::str::FromStr;
/// Request/response types for the storage controller /// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server /// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`] /// in [`attachment_service::http`]
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId}; use utils::id::NodeId;
use crate::{ use crate::{
models::{ShardParameters, TenantConfig}, models::{ShardParameters, TenantConfig},
@@ -42,12 +42,6 @@ pub struct NodeConfigureRequest {
pub scheduling: Option<NodeSchedulingPolicy>, pub scheduling: Option<NodeSchedulingPolicy>,
} }
/// Request body carrying tenant-level policies; both fields are optional.
// NOTE(review): presumably a `None` field means "leave that policy unchanged" — confirm against the handler.
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
/// Placement policy, if supplied.
pub placement: Option<PlacementPolicy>,
/// Shard scheduling policy, if supplied.
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)] #[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard { pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId, pub shard_id: TenantShardId,
@@ -68,27 +62,12 @@ pub struct TenantLocateResponse {
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse { pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>, pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize, pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy, pub policy: PlacementPolicy,
pub config: TenantConfig, pub config: TenantConfig,
} }
/// Serializable description of a single node: identity, availability,
/// scheduling policy and listen addresses.
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
/// Identifier of the node.
pub id: NodeId,
/// Availability in its wrapper form (without a utilisation score).
pub availability: NodeAvailabilityWrapper,
/// Scheduling policy of the node.
pub scheduling: NodeSchedulingPolicy,

/// HTTP listen address and port.
pub listen_http_addr: String,
pub listen_http_port: u16,

/// Postgres-protocol listen address and port (going by the `pg` in the name).
pub listen_pg_addr: String,
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard { pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId, pub tenant_shard_id: TenantShardId,
@@ -104,8 +83,6 @@ pub struct TenantDescribeResponseShard {
pub is_pending_compute_notification: bool, pub is_pending_compute_notification: bool,
/// A shard split is currently underway /// A shard split is currently underway
pub is_splitting: bool, pub is_splitting: bool,
pub scheduling_policy: ShardSchedulingPolicy,
} }
/// Explicitly migrating a particular shard is a low level operation /// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +97,7 @@ pub struct TenantShardMigrateRequest {
/// Utilisation score indicating how good a candidate a pageserver /// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better. /// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct UtilizationScore(pub u64); pub struct UtilizationScore(pub u64);
impl UtilizationScore { impl UtilizationScore {
@@ -129,7 +106,7 @@ impl UtilizationScore {
} }
} }
#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[derive(Serialize, Clone, Copy)]
#[serde(into = "NodeAvailabilityWrapper")] #[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability { pub enum NodeAvailability {
// Normal, happy state // Normal, happy state
@@ -152,7 +129,7 @@ impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to // This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the // communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting. // utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[derive(Serialize, Deserialize, Clone)]
pub enum NodeAvailabilityWrapper { pub enum NodeAvailabilityWrapper {
Active, Active,
Offline, Offline,
@@ -178,33 +155,22 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
} }
} }
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] impl FromStr for NodeAvailability {
pub enum ShardSchedulingPolicy { type Err = anyhow::Error;
// Normal mode: the tenant's scheduled locations may be updated at will, including
// for non-essential optimization.
Active,
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. fn from_str(s: &str) -> Result<Self, Self::Err> {
// For example, this still permits a node's attachment location to change to a secondary in match s {
// response to a node failure, or to assign a new secondary if a node was removed. // This is used when parsing node configuration requests from neon-local.
Essential, // Assume the worst possible utilisation score
// and let it get updated via the heartbeats.
// No scheduling: leave the shard running wherever it currently is. Even if the shard is "active" => Ok(Self::Active(UtilizationScore::worst())),
// unavailable, it will not be rescheduled to another node. "offline" => Ok(Self::Offline),
Pause, _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
Stop,
}
impl Default for ShardSchedulingPolicy {
fn default() -> Self {
Self::Active
} }
} }
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy { pub enum NodeSchedulingPolicy {
Active, Active,
Filling, Filling,

View File

@@ -20,7 +20,6 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter, history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId}, id::{NodeId, TenantId, TimelineId},
lsn::Lsn, lsn::Lsn,
serde_system_time,
}; };
use crate::controller_api::PlacementPolicy; use crate::controller_api::PlacementPolicy;
@@ -302,7 +301,6 @@ pub struct TenantConfig {
pub heatmap_period: Option<String>, pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>, pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>, pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -747,18 +745,10 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>, pub gc_horizon: Option<u64>,
} }
/// Status of a WAL redo process, as reported in [`WalRedoManagerStatus`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
/// Process id of the WAL redo process.
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` is a transitory thing, so it has no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus { pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>, pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub process: Option<WalRedoManagerProcessStatus>, pub pid: Option<u32>,
} }
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -767,7 +757,11 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)] #[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress { pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded. /// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option<serde_system_time::SystemTime>, #[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
/// The number of layers currently on-disk /// The number of layers currently on-disk
pub layers_downloaded: usize, pub layers_downloaded: usize,
@@ -780,6 +774,29 @@ pub struct SecondaryProgress {
pub bytes_total: u64, pub bytes_total: u64,
} }
/// Serializes an optional timestamp: `Some` becomes an RFC 3339 string with
/// millisecond precision, `None` is serialized as none.
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
    ts: &Option<SystemTime>,
    serializer: S,
) -> Result<S::Ok, S::Error> {
    let Some(instant) = ts else {
        return serializer.serialize_none();
    };
    let formatted = humantime::format_rfc3339_millis(*instant);
    serializer.collect_str(&formatted)
}
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
}
pub mod virtual_file { pub mod virtual_file {
#[derive( #[derive(
Copy, Copy,

View File

@@ -1,4 +1,4 @@
use utils::serde_system_time::SystemTime; use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant. /// the next tenant.
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time. /// When was this snapshot captured, pageserver local time.
/// ///
/// Use millis to give confidence that the value is regenerated often enough. /// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime, pub captured_at: SystemTime,
} }
/// Serializes a timestamp as an RFC 3339 string with millisecond precision.
fn ser_rfc3339_millis<S: serde::Serializer>(
    ts: &SystemTime,
    serializer: S,
) -> Result<S::Ok, S::Error> {
    let formatted = humantime::format_rfc3339_millis(*ts);
    serializer.collect_str(&formatted)
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
/// ///
/// Instead of newtype, use this because a newtype would get require handling deserializing values /// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -50,9 +69,7 @@ mod tests {
disk_usage_bytes: u64::MAX, disk_usage_bytes: u64::MAX,
free_space_bytes: 0, free_space_bytes: 0,
utilization_score: u64::MAX, utilization_score: u64::MAX,
captured_at: SystemTime( captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
}; };
let s = serde_json::to_string(&doc).unwrap(); let s = serde_json::to_string(&doc).unwrap();

View File

@@ -5,93 +5,15 @@ use crate::{
models::ShardParameters, models::ShardParameters,
}; };
use hex::FromHex; use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use utils::id::TenantId; use utils::id::TenantId;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide a summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8); pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8); pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
    /// Zero-based index of the shard within its tenant.
    pub shard_number: ShardNumber,
    /// How many shards make up the tenant, or the special `unsharded` value.
    pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
    /// Zero-based index of this shard within its tenant.
    pub number: ShardNumber,
    /// How many shards make up the tenant, or the special `unsharded` value.
    pub count: ShardCount,
    /// Length of the contiguous runs of keys (stripes) distributed across shards, in 8kiB pages.
    pub stripe_size: ShardStripeSize,
    /// Data distribution scheme; kept private to this module.
    layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
///
/// ```text
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
/// ```
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
    /// The tenant this shard belongs to.
    pub tenant_id: TenantId,
    /// Zero-based index of the shard within the tenant.
    pub shard_number: ShardNumber,
    /// How many shards make up the tenant, or the special `unsharded` value.
    pub shard_count: ShardCount,
}
impl ShardCount { impl ShardCount {
pub const MAX: Self = Self(u8::MAX); pub const MAX: Self = Self(u8::MAX);
@@ -116,7 +38,6 @@ impl ShardCount {
self.0 self.0
} }
///
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.0 == 0 self.0 == 0
} }
@@ -132,6 +53,33 @@ impl ShardNumber {
pub const MAX: Self = Self(u8::MAX); pub const MAX: Self = Self(u8::MAX);
} }
/// TenantShardId identifies the units of work for the Pageserver.
///
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
///
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// Historically, tenants could not have multiple shards, and were identified
/// by TenantId. To support this, TenantShardId has a special legacy
/// mode where `shard_count` is equal to zero: this represents a single-sharded
/// tenant which should be written as a TenantId with no suffix.
///
/// The human-readable encoding of TenantShardId, such as used in API URLs,
/// is both forward and backward compatible: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
///
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl TenantShardId { impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self { pub fn unsharded(tenant_id: TenantId) -> Self {
Self { Self {
@@ -163,13 +111,10 @@ impl TenantShardId {
} }
/// Convenience for code that has special behavior on the 0th shard. /// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool { pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0) self.shard_number == ShardNumber(0)
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
} }
@@ -205,6 +150,9 @@ impl TenantShardId {
} }
} }
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> { impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!( write!(
@@ -274,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
} }
} }
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex { impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self { pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self { Self {
@@ -288,9 +246,6 @@ impl ShardIndex {
} }
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
} }
@@ -358,8 +313,6 @@ impl Serialize for TenantShardId {
if serializer.is_human_readable() { if serializer.is_human_readable() {
serializer.collect_str(self) serializer.collect_str(self)
} else { } else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18]; let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0; packed[16] = self.shard_number.0;
@@ -437,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size. /// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information a shard needs to resolve a key
/// to a shard number, and then to check whether that shard is itself.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)] #[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError { pub enum ShardConfigError {
#[error("Invalid shard count")] #[error("Invalid shard count")]
@@ -476,9 +439,6 @@ impl ShardIdentity {
} }
} }
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool { pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0) self.number == ShardNumber(0) && self.count == ShardCount(0)
} }
@@ -527,8 +487,6 @@ impl ShardIdentity {
} }
/// Return true if the key should be ingested by this shard /// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool { pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken()); assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -538,28 +496,8 @@ impl ShardIdentity {
} }
} }
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
///
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
/// as a symptom of that issue.
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
return false;
}
let mut hash = murmurhash32(key.field4);
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
// The key may be affected by issue #7454: it is an initfork and it would not
// have mapped to shard 0 until we fixed that issue.
mapped_shard != ShardNumber(0)
}
/// Return true if the key should be discarded if found in this shard's /// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split. /// data store, e.g. during compaction after a split
///
/// Shards _may_ drop keys which return false here, but are not obliged to.
pub fn is_key_disposable(&self, key: &Key) -> bool { pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) { if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0? // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -585,7 +523,7 @@ impl ShardIdentity {
/// Convenience for checking if this identity is the 0th shard in a tenant, /// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes. /// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_shard_zero(&self) -> bool { pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0) self.number == ShardNumber(0)
} }
} }
@@ -668,13 +606,7 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets // relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations. // requests, and any request other than those for particular blocks in relations.
// !is_rel_block_key(key)
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!is_rel_block_key(key) || is_initfork
} }
/// Provide the same result as the function in postgres `hashfn.h` with the same name /// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -565,16 +565,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>); pub struct StorageMetadata(HashMap<String, String>);
/// Builds a [`StorageMetadata`] from a fixed-size array of borrowed key/value pairs,
/// copying every key and value into an owned `String`.
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
    fn from(arr: [(&str, &str); N]) -> Self {
        let mut entries: HashMap<String, String> = HashMap::with_capacity(N);
        for &(key, value) in arr.iter() {
            // A later duplicate key overwrites the earlier entry.
            entries.insert(key.to_string(), value.to_string());
        }
        Self(entries)
    }
}
/// External backup storage configuration, enough for creating a client for that storage. /// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig { pub struct RemoteStorageConfig {

View File

@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
Disabled, Disabled,
} }
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage { impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self { async fn setup() -> Self {
ensure_logging_ready(); ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
remote_blobs: HashSet<RemotePath>, remote_blobs: HashSet<RemotePath>,
} }
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self { async fn setup() -> Self {
ensure_logging_ready(); ensure_logging_ready();
@@ -146,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>, remote_blobs: HashSet<RemotePath>,
} }
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self { async fn setup() -> Self {
ensure_logging_ready(); ensure_logging_ready();

View File

@@ -219,6 +219,7 @@ enum MaybeEnabledStorage {
Disabled, Disabled,
} }
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage { impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self { async fn setup() -> Self {
ensure_logging_ready(); ensure_logging_ready();
@@ -247,6 +248,7 @@ struct S3WithTestBlobs {
remote_blobs: HashSet<RemotePath>, remote_blobs: HashSet<RemotePath>,
} }
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self { async fn setup() -> Self {
ensure_logging_ready(); ensure_logging_ready();
@@ -308,6 +310,7 @@ struct S3WithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>, remote_blobs: HashSet<RemotePath>,
} }
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self { async fn setup() -> Self {
ensure_logging_ready(); ensure_logging_ready();

View File

@@ -22,7 +22,6 @@ camino.workspace = true
chrono.workspace = true chrono.workspace = true
heapless.workspace = true heapless.workspace = true
hex = { workspace = true, features = ["serde"] } hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] } hyper = { workspace = true, features = ["full"] }
fail.workspace = true fail.workspace = true
futures = { workspace = true} futures = { workspace = true}

View File

@@ -1,21 +0,0 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
/// Reads the environment variable `varname` and parses it into `V`.
///
/// Returns `None` when the variable is not set.
///
/// # Panics
///
/// Panics if the variable is set but is not valid unicode, or if its value cannot be
/// parsed as `V`.
pub fn var<V, E>(varname: &str) -> Option<V>
where
    V: FromStr<Err = E>,
    E: Display,
{
    let raw = match std::env::var(varname) {
        Ok(raw) => raw,
        Err(std::env::VarError::NotPresent) => return None,
        Err(std::env::VarError::NotUnicode(_)) => {
            panic!("env var {varname} is not unicode")
        }
    };

    let parsed: Result<V, String> = raw
        .parse::<V>()
        .map_err(|e| format!("failed to parse env var {varname}: {e:#}"));
    Some(parsed.unwrap())
}

View File

@@ -63,7 +63,6 @@ pub mod measured_stream;
pub mod serde_percent; pub mod serde_percent;
pub mod serde_regex; pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback; pub mod pageserver_feedback;
@@ -90,10 +89,6 @@ pub mod yielding_loop;
pub mod zstd; pub mod zstd;
pub mod env;
pub mod poison;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
/// ///
/// we have several cases: /// we have several cases:

View File

@@ -1,121 +0,0 @@
//! Protect a piece of state from reuse after it is left in an inconsistent state.
//!
//! # Example
//!
//! ```
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!
//! struct State {
//! clean: bool,
//! }
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
//!
//! let mut mutex_guard = state.lock().await;
//! let mut poison_guard = mutex_guard.check_and_arm()?;
//! let state = poison_guard.data_mut();
//! state.clean = false;
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
//! tokio::time::sleep(Duration::from_secs(10)).await;
//! state.clean = true;
//! poison_guard.disarm();
//! # Ok::<(), utils::poison::Error>(())
//! # });
//! ```
use tracing::warn;
/// Wraps a value and tracks whether a [`Guard`] over it was dropped without being disarmed.
pub struct Poison<T> {
    /// Label for the wrapped state; used in the poisoning log line and in [`Error::Poisoned`].
    what: &'static str,
    /// Current poisoning state.
    state: State,
    /// The wrapped value, exposed through [`Guard::data`] and [`Guard::data_mut`].
    data: T,
}
/// Poisoning state of a [`Poison`].
#[derive(Clone, Copy)]
enum State {
    /// Not armed: the initial state, restored by [`Guard::disarm`].
    Clean,
    /// Set by [`Poison::check_and_arm`] while a [`Guard`] is in use.
    Armed,
    /// A [`Guard`] was dropped while still armed; `at` is the time of that drop.
    Poisoned { at: chrono::DateTime<chrono::Utc> },
}
impl<T> Poison<T> {
/// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
pub fn new(what: &'static str, data: T) -> Self {
Self {
what,
state: State::Clean,
data,
}
}
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
match self.state {
State::Clean => {
self.state = State::Armed;
Ok(Guard(self))
}
State::Armed => unreachable!("transient state"),
State::Poisoned { at } => Err(Error::Poisoned {
what: self.what,
at,
}),
}
}
}
/// Exclusive handle to the state wrapped by a [`Poison`], returned by [`Poison::check_and_arm`].
///
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
/// Once modifications are done, use [`Self::disarm`].
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);
impl<'a, T> Guard<'a, T> {
pub fn data(&self) -> &T {
&self.0.data
}
pub fn data_mut(&mut self) -> &mut T {
&mut self.0.data
}
pub fn disarm(self) {
match self.0.state {
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
State::Armed => {
self.0.state = State::Clean;
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
/// Dropping a guard that is still armed poisons the wrapped state and logs a warning.
impl<'a, T> Drop for Guard<'a, T> {
    fn drop(&mut self) {
        let poison = &mut *self.0;
        match poison.state {
            State::Poisoned { at } => {
                unreachable!("we fail check_and_arm() if it's in that state: {at}")
            }
            State::Armed => {
                // disarm() was never called: record when the state became unusable.
                let at = chrono::Utc::now();
                poison.state = State::Poisoned { at };
                warn!(at=?at, "poisoning {}", poison.what);
            }
            State::Clean => {
                // disarm() already ran; nothing to do.
            }
        }
    }
}
/// Error returned by [`Poison::check_and_arm`].
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The state was poisoned by a [`Guard`] that was dropped without being disarmed.
    #[error("poisoned at {at}: {what}")]
    Poisoned {
        /// Label of the poisoned state, as passed to [`Poison::new`].
        what: &'static str,
        /// Time at which the state was poisoned.
        at: chrono::DateTime<chrono::Utc>,
    },
}

View File

@@ -182,18 +182,6 @@ where
} }
} }
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
///
/// Returns `Ok(())` when the current value is already at least `num`, otherwise
/// `Err` carrying the current value.
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
    // Read the current value in its own scope so the lock is released before comparing.
    let current = {
        let guard = self.internal.lock().unwrap();
        guard.current.cnt_value()
    };
    if !(current >= num) {
        return Err(current);
    }
    Ok(())
}
/// Register and return a channel that will be notified when a number arrives, /// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived. /// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> { fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {

View File

@@ -1,55 +0,0 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
/// Newtype around [`std::time::SystemTime`] whose serde representation is the inner value
/// alone (`#[serde(transparent)]`), written by `ser_rfc3339_millis` and read by
/// `deser_rfc3339_millis`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
    // The wrapped timestamp; public so callers can construct and unwrap the newtype directly.
    #[serde(
        deserialize_with = "deser_rfc3339_millis",
        serialize_with = "ser_rfc3339_millis"
    )]
    pub std::time::SystemTime,
);
/// Serializes `ts` as an RFC 3339 string with millisecond precision.
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
    ts: &std::time::SystemTime,
    serializer: S,
) -> Result<S::Ok, S::Error> {
    let formatted = humantime::format_rfc3339_millis(*ts);
    serializer.collect_str(&formatted)
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Truncates a `SystemTime` to millisecond precision by dropping the extra nanoseconds.
    ///
    /// Times before the unix epoch are returned unchanged.
    fn to_millisecond_precision(time: SystemTime) -> SystemTime {
        let since_epoch = match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
            Ok(since_epoch) => since_epoch,
            Err(_) => return time,
        };
        let total_millis = since_epoch.as_secs() * 1_000 + u64::from(since_epoch.subsec_millis());
        let truncated =
            std::time::SystemTime::UNIX_EPOCH + std::time::Duration::from_millis(total_millis);
        SystemTime(truncated)
    }

    /// Round trip: serialize to the expected JSON string, then parse that string back.
    #[test]
    fn test_serialize_deserialize() {
        let input = SystemTime(std::time::SystemTime::now());
        let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));

        let serialized = serde_json::to_string(&input).unwrap();
        assert_eq!(expected_serialized, serialized);

        // The serialized form only carries milliseconds, so compare against a truncated input.
        let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
        assert_eq!(to_millisecond_precision(input), deserialized);
    }
}

View File

@@ -192,14 +192,6 @@ impl<T> OnceCell<T> {
} }
} }
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
/// initialized.
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
    // `&mut self` gives exclusive access, so the inner value is reached via `get_mut`.
    self.inner.get_mut().unwrap().take_and_deinit()
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize { pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed) self.initializers.load(Ordering::Relaxed)
@@ -254,23 +246,15 @@ impl<'a, T> Guard<'a, T> {
/// The permit will be on a semaphore part of the new internal value, and any following /// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete. /// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(mut self) -> (T, InitPermit) { pub fn take_and_deinit(mut self) -> (T, InitPermit) {
self.0
.take_and_deinit()
.expect("guard is not created unless value has been initialized")
}
}
impl<T> Inner<T> {
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let value = self.value.take()?;
let mut swapped = Inner::default(); let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone(); let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit // acquire and forget right away, moving the control over to InitPermit
sem.try_acquire().expect("we just created this").forget(); sem.try_acquire().expect("we just created this").forget();
let permit = InitPermit(sem); std::mem::swap(&mut *self.0, &mut swapped);
std::mem::swap(self, &mut swapped); swapped
Some((value, permit)) .value
.map(|v| (v, InitPermit(sem)))
.expect("guard is not created unless value has been initialized")
} }
} }
@@ -279,13 +263,6 @@ impl<T> Inner<T> {
/// On drop, this type will return the permit. /// On drop, this type will return the permit.
pub struct InitPermit(Arc<tokio::sync::Semaphore>); pub struct InitPermit(Arc<tokio::sync::Semaphore>);
impl std::fmt::Debug for InitPermit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ptr = Arc::as_ptr(&self.0) as *const ();
f.debug_tuple("InitPermit").field(&ptr).finish()
}
}
impl Drop for InitPermit { impl Drop for InitPermit {
fn drop(&mut self) { fn drop(&mut self) {
assert_eq!( assert_eq!(
@@ -582,22 +559,4 @@ mod tests {
assert_eq!(*target.get().unwrap(), 11); assert_eq!(*target.get().unwrap(), 11);
} }
/// `OnceCell::take_and_deinit` yields the value exactly once, and only after initialization.
#[tokio::test]
async fn take_and_deinit_on_mut() {
    use std::convert::Infallible;

    let mut cell = OnceCell::<u32>::default();

    // Never initialized: nothing to take.
    let before_init = cell.take_and_deinit();
    assert!(before_init.is_none());

    cell.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
        .await
        .unwrap();

    // Initialized: the value comes out together with a permit.
    let again = cell.take_and_deinit();
    assert!(matches!(again, Some((42, _))), "{again:?}");

    // Already taken: the cell is empty again.
    let after_take = cell.take_and_deinit();
    assert!(after_take.is_none());
}
} }

View File

@@ -27,50 +27,30 @@
//! //!
//! # Reference Numbers //! # Reference Numbers
//! //!
//! 2024-04-15 on i3en.3xlarge //! 2024-03-20 on i3en.3xlarge
//! //!
//! ```text //! ```text
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs] //! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs] //! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs] //! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs] //! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs] //! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs] //! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs] //! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms] //! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs] //! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs] //! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs] //! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs] //! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] //! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] //! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] //! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! ``` //! ```
use bytes::{Buf, Bytes}; use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion}; use criterion::{BenchmarkId, Criterion};
use pageserver::{ use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver_api::{key::Key, shard::TenantShardId}; use pageserver_api::{key::Key, shard::TenantShardId};
use std::{ use std::{
sync::Arc, sync::Arc,
@@ -80,39 +60,33 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn}; use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) { fn bench(c: &mut Criterion) {
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] { {
{ let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; for nclients in nclients {
for nclients in nclients { let mut group = c.benchmark_group("short");
let mut group = c.benchmark_group(format!("{process_kind}-short")); group.bench_with_input(
group.bench_with_input( BenchmarkId::from_parameter(nclients),
BenchmarkId::from_parameter(nclients), &nclients,
&nclients, |b, nclients| {
|b, nclients| { let redo_work = Arc::new(Request::short_input());
let redo_work = Arc::new(Request::short_input()); b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
b.iter_custom(|iters| { },
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) );
});
},
);
}
} }
}
{ {
let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients { for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium")); let mut group = c.benchmark_group("medium");
group.bench_with_input( group.bench_with_input(
BenchmarkId::from_parameter(nclients), BenchmarkId::from_parameter(nclients),
&nclients, &nclients,
|b, nclients| { |b, nclients| {
let redo_work = Arc::new(Request::medium_input()); let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| { b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients) },
}); );
},
);
}
} }
} }
} }
@@ -120,16 +94,10 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches); criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl( fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = Box::leak(Box::new(conf)); let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -145,40 +113,25 @@ fn bench_impl(
let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager); let manager = Arc::new(manager);
// divide the amount of work equally among the clients.
let nredos_per_client = n_redos / nclients;
for _ in 0..nclients { for _ in 0..nclients {
rt.block_on(async { rt.block_on(async {
tasks.spawn(client( tasks.spawn(client(
Arc::clone(&manager), Arc::clone(&manager),
Arc::clone(&start), Arc::clone(&start),
Arc::clone(&redo_work), Arc::clone(&redo_work),
nredos_per_client, // divide the amount of work equally among the clients
n_redos / nclients,
)) ))
}); });
} }
let elapsed = rt.block_on(async move { rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO; let mut total_wallclock_time = std::time::Duration::from_millis(0);
while let Some(res) = tasks.join_next().await { while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap(); total_wallclock_time += res.unwrap();
} }
total_wallclock_time total_wallclock_time
}); })
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
} }
async fn client( async fn client(

View File

@@ -128,12 +128,12 @@ impl Client {
pub async fn timeline_info( pub async fn timeline_info(
&self, &self,
tenant_shard_id: TenantShardId, tenant_id: TenantId,
timeline_id: TimelineId, timeline_id: TimelineId,
force_await_logical_size: ForceAwaitLogicalSize, force_await_logical_size: ForceAwaitLogicalSize,
) -> Result<pageserver_api::models::TimelineInfo> { ) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!( let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint self.mgmt_api_endpoint
); );
@@ -151,11 +151,11 @@ impl Client {
pub async fn keyspace( pub async fn keyspace(
&self, &self,
tenant_shard_id: TenantShardId, tenant_id: TenantId,
timeline_id: TimelineId, timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> { ) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!( let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint self.mgmt_api_endpoint
); );
self.get(&uri) self.get(&uri)

View File

@@ -11,6 +11,7 @@ default = []
anyhow.workspace = true anyhow.workspace = true
async-compression.workspace = true async-compression.workspace = true
async-stream.workspace = true async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true byteorder.workspace = true
bytes.workspace = true bytes.workspace = true
chrono = { workspace = true, features = ["serde"] } chrono = { workspace = true, features = ["serde"] }

View File

@@ -43,8 +43,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
fanout: u64, fanout: u64,
ctx: &E::RequestContext, ctx: &E::RequestContext,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}"); assert!(fanout >= 2);
let exp_base = fanout.max(2);
// Start at L0 // Start at L0
let mut current_level_no = 0; let mut current_level_no = 0;
let mut current_level_target_height = target_file_size; let mut current_level_target_height = target_file_size;
@@ -107,7 +106,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
break; break;
} }
current_level_no += 1; current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(exp_base); current_level_target_height = current_level_target_height.saturating_mul(fanout);
} }
Ok(()) Ok(())
} }

View File

@@ -180,7 +180,7 @@ where
match top.deref_mut() { match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => { LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx); let fut = l.load_keys(this.ctx);
this.load_future.set(Some(Box::pin(fut))); this.load_future.set(Some(fut));
continue; continue;
} }
LazyLoadLayer::Loaded(ref mut entries) => { LazyLoadLayer::Loaded(ref mut entries) => {

View File

@@ -3,6 +3,7 @@
//! //!
//! All the heavy lifting is done by the create_image and create_delta //! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides. //! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future; use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size}; use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range; use std::ops::Range;
@@ -140,16 +141,18 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn is_delta(&self) -> bool; fn is_delta(&self) -> bool;
} }
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> { pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where where
Self: 'a; Self: 'a;
/// Return all keys in this delta layer. /// Return all keys in this delta layer.
fn load_keys<'a>( async fn load_keys<'a>(
&self, &self,
ctx: &E::RequestContext, ctx: &E::RequestContext,
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send; ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
} }
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {} pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}

View File

@@ -2,6 +2,7 @@ mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt; use futures::StreamExt;
use rand::Rng; use rand::Rng;
use tracing::info; use tracing::info;
@@ -138,6 +139,7 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
} }
} }
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> { impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord; type DeltaEntry<'a> = MockRecord;

View File

@@ -12,14 +12,9 @@ bytes.workspace = true
camino.workspace = true camino.workspace = true
clap = { workspace = true, features = ["string"] } clap = { workspace = true, features = ["string"] }
git-version.workspace = true git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." } pageserver = { path = ".." }
pageserver_api.workspace = true
remote_storage = { path = "../../libs/remote_storage" }
postgres_ffi.workspace = true postgres_ffi.workspace = true
tokio.workspace = true tokio.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
utils.workspace = true utils.workspace = true
svg_fmt.workspace = true svg_fmt.workspace = true
workspace_hack.workspace = true workspace_hack.workspace = true

View File

@@ -9,11 +9,6 @@ mod index_part;
mod layer_map_analyzer; mod layer_map_analyzer;
mod layers; mod layers;
use std::{
str::FromStr,
time::{Duration, SystemTime},
};
use camino::{Utf8Path, Utf8PathBuf}; use camino::{Utf8Path, Utf8PathBuf};
use clap::{Parser, Subcommand}; use clap::{Parser, Subcommand};
use index_part::IndexPartCmd; use index_part::IndexPartCmd;
@@ -25,16 +20,8 @@ use pageserver::{
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file, virtual_file,
}; };
use pageserver_api::shard::TenantShardId;
use postgres_ffi::ControlFileData; use postgres_ffi::ControlFileData;
use remote_storage::{RemotePath, RemoteStorageConfig}; use utils::{lsn::Lsn, project_git_version};
use tokio_util::sync::CancellationToken;
use utils::{
id::TimelineId,
logging::{self, LogFormat, TracingErrorLayerEnablement},
lsn::Lsn,
project_git_version,
};
project_git_version!(GIT_VERSION); project_git_version!(GIT_VERSION);
@@ -56,7 +43,6 @@ enum Commands {
#[command(subcommand)] #[command(subcommand)]
IndexPart(IndexPartCmd), IndexPart(IndexPartCmd),
PrintLayerFile(PrintLayerFileCmd), PrintLayerFile(PrintLayerFileCmd),
TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
DrawTimeline {}, DrawTimeline {},
AnalyzeLayerMap(AnalyzeLayerMapCmd), AnalyzeLayerMap(AnalyzeLayerMapCmd),
#[command(subcommand)] #[command(subcommand)]
@@ -82,26 +68,6 @@ struct PrintLayerFileCmd {
path: Utf8PathBuf, path: Utf8PathBuf,
} }
/// Roll back the time for the specified prefix using S3 history.
///
/// The command is fairly low level and powerful. Validation is only very light,
/// so it is more powerful, and thus potentially more dangerous.
#[derive(Parser)]
struct TimeTravelRemotePrefixCmd {
/// A configuration string for the remote_storage configuration.
///
/// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
config_toml_str: String,
/// remote prefix to time travel recover. For safety reasons, we require it to contain
/// a timeline or tenant ID in the prefix.
prefix: String,
/// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
travel_to: String,
/// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
/// You can use a few seconds before invoking the command. Same format as `travel_to`.
done_if_after: Option<String>,
}
#[derive(Parser)] #[derive(Parser)]
struct AnalyzeLayerMapCmd { struct AnalyzeLayerMapCmd {
/// Pageserver data path /// Pageserver data path
@@ -112,14 +78,6 @@ struct AnalyzeLayerMapCmd {
#[tokio::main] #[tokio::main]
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
logging::init(
LogFormat::Plain,
TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
let cli = CliOpts::parse(); let cli = CliOpts::parse();
match cli.command { match cli.command {
@@ -147,42 +105,6 @@ async fn main() -> anyhow::Result<()> {
print_layerfile(&cmd.path).await?; print_layerfile(&cmd.path).await?;
} }
} }
Commands::TimeTravelRemotePrefix(cmd) => {
let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
.map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
humantime::parse_rfc3339(done_if_after).map_err(|_e| {
anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
})?
} else {
const SAFETY_MARGIN: Duration = Duration::from_secs(3);
tokio::time::sleep(SAFETY_MARGIN).await;
// Convert to string representation and back to get rid of sub-second values
let done_if_after = SystemTime::now();
tokio::time::sleep(SAFETY_MARGIN).await;
done_if_after
};
let timestamp = strip_subsecond(timestamp);
let done_if_after = strip_subsecond(done_if_after);
let Some(prefix) = validate_prefix(&cmd.prefix) else {
println!("specified prefix '{}' failed validation", cmd.prefix);
return Ok(());
};
let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let cancel = CancellationToken::new();
storage
.unwrap()
.time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
.await?;
}
}; };
Ok(()) Ok(())
} }
@@ -263,89 +185,3 @@ fn handle_metadata(
Ok(()) Ok(())
} }
/// Ensures that the given S3 prefix is sufficiently constrained.
/// The command is very risky already and we don't want to expose something
/// that allows usually unintentional and quite catastrophic time travel of
/// an entire bucket, which would be a major catastrophy and away
/// by only one character change (similar to "rm -r /home /username/foobar").
fn validate_prefix(prefix: &str) -> Option<RemotePath> {
if prefix.is_empty() {
// Empty prefix means we want to specify the *whole* bucket
return None;
}
let components = prefix.split('/').collect::<Vec<_>>();
let (last, components) = {
let last = components.last()?;
if last.is_empty() {
(
components.iter().nth_back(1)?,
&components[..(components.len() - 1)],
)
} else {
(last, &components[..])
}
};
'valid: {
if let Ok(_timeline_id) = TimelineId::from_str(last) {
// Ends in either a tenant or timeline ID
break 'valid;
}
if *last == "timelines" {
if let Some(before_last) = components.iter().nth_back(1) {
if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
// Has a valid tenant id
break 'valid;
}
}
}
return None;
}
RemotePath::from_string(prefix).ok()
}
fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_validate_prefix() {
assert_eq!(validate_prefix(""), None);
assert_eq!(validate_prefix("/"), None);
#[track_caller]
fn assert_valid(prefix: &str) {
let remote_path = RemotePath::from_string(prefix).unwrap();
assert_eq!(validate_prefix(prefix), Some(remote_path));
}
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
// Path is not relative but absolute
assert_eq!(
validate_prefix(
"/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
),
None
);
assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
// Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
assert_eq!(validate_prefix("wal"), None);
assert_eq!(validate_prefix("/wal/"), None);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
// Partial tenant ID
assert_eq!(
validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
None
);
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
}
}

View File

@@ -1,5 +1,4 @@
use anyhow::Context; use anyhow::Context;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest; use pageserver_client::page_service::BasebackupRequest;
@@ -96,7 +95,7 @@ async fn main_impl(
let timeline = *timeline; let timeline = *timeline;
let info = mgmt_api_client let info = mgmt_api_client
.timeline_info( .timeline_info(
TenantShardId::unsharded(timeline.tenant_id), timeline.tenant_id,
timeline.timeline_id, timeline.timeline_id,
ForceAwaitLogicalSize::No, ForceAwaitLogicalSize::No,
) )

View File

@@ -4,7 +4,6 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::PagestreamGetPageRequest; use pageserver_api::models::PagestreamGetPageRequest;
use pageserver_api::shard::TenantShardId;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use utils::id::TenantTimelineId; use utils::id::TenantTimelineId;
use utils::lsn::Lsn; use utils::lsn::Lsn;
@@ -174,10 +173,7 @@ async fn main_impl(
let timeline = *timeline; let timeline = *timeline;
async move { async move {
let partitioning = mgmt_api_client let partitioning = mgmt_api_client
.keyspace( .keyspace(timeline.tenant_id, timeline.timeline_id)
TenantShardId::unsharded(timeline.tenant_id),
timeline.timeline_id,
)
.await?; .await?;
let lsn = partitioning.at_lsn; let lsn = partitioning.at_lsn;
let start = Instant::now(); let start = Instant::now();

View File

@@ -1,7 +1,6 @@
use std::sync::Arc; use std::sync::Arc;
use humantime::Duration; use humantime::Duration;
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet; use tokio::task::JoinSet;
use utils::id::TenantTimelineId; use utils::id::TenantTimelineId;
@@ -60,11 +59,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
let mgmt_api_client = Arc::clone(&mgmt_api_client); let mgmt_api_client = Arc::clone(&mgmt_api_client);
js.spawn(async move { js.spawn(async move {
let info = mgmt_api_client let info = mgmt_api_client
.timeline_info( .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.await .await
.unwrap(); .unwrap();
@@ -79,11 +74,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
while !info.current_logical_size_is_accurate { while !info.current_logical_size_is_accurate {
ticker.tick().await; ticker.tick().await;
info = mgmt_api_client info = mgmt_api_client
.timeline_info( .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
TenantShardId::unsharded(tl.tenant_id),
tl.timeline_id,
ForceAwaitLogicalSize::Yes,
)
.await .await
.unwrap(); .unwrap();
} }

View File

@@ -13,7 +13,7 @@
use anyhow::{anyhow, bail, ensure, Context}; use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, Bytes, BytesMut}; use bytes::{BufMut, Bytes, BytesMut};
use fail::fail_point; use fail::fail_point;
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key}; use pageserver_api::key::{key_to_slru_block, Key};
use postgres_ffi::pg_constants; use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite; use std::fmt::Write as FmtWrite;
use std::time::SystemTime; use std::time::SystemTime;
@@ -297,20 +297,7 @@ where
if rel.forknum == INIT_FORKNUM { if rel.forknum == INIT_FORKNUM {
// I doubt we need _init fork itself, but having it at least // I doubt we need _init fork itself, but having it at least
// serves as a marker relation is unlogged. // serves as a marker relation is unlogged.
if let Err(_e) = self.add_rel(rel, rel).await { self.add_rel(rel, rel).await?;
if self
.timeline
.get_shard_identity()
.is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
{
// Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
// whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
// postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
// recreate.
tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
continue;
}
};
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
continue; continue;
} }

View File

@@ -18,7 +18,6 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources}; use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage; use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant; use tokio::time::Instant;
use tracing::*; use tracing::*;
@@ -285,7 +284,6 @@ fn start_pageserver(
)) ))
.unwrap(); .unwrap();
pageserver::preinitialize_metrics(); pageserver::preinitialize_metrics();
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
// If any failpoints were set from FAILPOINTS environment variable, // If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes // print them to the log for debugging purposes
@@ -673,37 +671,42 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal. // All started up! Now just sit and wait for shutdown signal.
{ {
BACKGROUND_RUNTIME.block_on(async move { use signal_hook::consts::*;
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); let mut signals =
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
let signal = tokio::select! { return signals
_ = sigquit.recv() => { .forever()
info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); .next()
std::process::exit(111); .expect("forever() never returns None unless explicitly closed");
} });
_ = sigint.recv() => { "SIGINT" }, let signal = BACKGROUND_RUNTIME
_ = sigterm.recv() => { "SIGTERM" }, .block_on(signal_handler)
}; .expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); // This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// This cancels the `shutdown_pageserver` cancellation tree. // The plan is to change that over time.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead. shutdown_pageserver.take();
// The plan is to change that over time. let bg_remote_storage = remote_storage.clone();
shutdown_pageserver.take(); let bg_deletion_queue = deletion_queue.clone();
let bg_remote_storage = remote_storage.clone(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
let bg_deletion_queue = deletion_queue.clone(); &tenant_manager,
pageserver::shutdown_pageserver( bg_remote_storage.map(|_| bg_deletion_queue),
&tenant_manager, 0,
bg_remote_storage.map(|_| bg_deletion_queue), ));
0, unreachable!()
) }
.await; _ => unreachable!(),
unreachable!() }
})
} }
} }

View File

@@ -97,8 +97,6 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
/// ///
/// Default built-in configuration file. /// Default built-in configuration file.
/// ///
@@ -142,8 +140,6 @@ pub mod defaults {
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
[tenant_config] [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -294,8 +290,6 @@ pub struct PageServerConf {
/// ///
/// Setting this to zero disables limits on total ephemeral layer size. /// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize, pub ephemeral_bytes_per_memory_kb: usize,
pub walredo_process_kind: crate::walredo::ProcessKind,
} }
/// We do not want to store this in a PageServerConf because the latter may be logged /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -419,8 +413,6 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>, validate_vectored_get: BuilderValue<bool>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>, ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
} }
impl PageServerConfigBuilder { impl PageServerConfigBuilder {
@@ -508,8 +500,6 @@ impl PageServerConfigBuilder {
)), )),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
} }
} }
} }
@@ -693,10 +683,6 @@ impl PageServerConfigBuilder {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
} }
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
self.walredo_process_kind = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> { pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values(); let default = Self::default_values();
@@ -753,7 +739,6 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes, max_vectored_read_bytes,
validate_vectored_get, validate_vectored_get,
ephemeral_bytes_per_memory_kb, ephemeral_bytes_per_memory_kb,
walredo_process_kind,
} }
CUSTOM LOGIC CUSTOM LOGIC
{ {
@@ -1047,9 +1032,6 @@ impl PageServerConf {
"ephemeral_bytes_per_memory_kb" => { "ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
} }
"walredo_process_kind" => {
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
}
_ => bail!("unrecognized pageserver option '{key}'"), _ => bail!("unrecognized pageserver option '{key}'"),
} }
} }
@@ -1132,7 +1114,6 @@ impl PageServerConf {
), ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
} }
} }
} }
@@ -1370,8 +1351,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant") .expect("Invalid default constant")
), ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}, },
"Correct defaults should be used when no config values are provided" "Correct defaults should be used when no config values are provided"
); );
@@ -1443,8 +1423,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant") .expect("Invalid default constant")
), ),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
}, },
"Should be able to parse all basic config values correctly" "Should be able to parse all basic config values correctly"
); );

View File

@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
continue; continue;
} }
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating // We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards. // synthetic size on other shards.
continue; continue;

View File

@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
}; };
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
if state != TenantState::Active || !id.is_shard_zero() { if state != TenantState::Active || !id.is_zero() {
None None
} else { } else {
tenant_manager tenant_manager

View File

@@ -12,7 +12,7 @@ use pageserver_api::{
use serde::{de::DeserializeOwned, Serialize}; use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use url::Url; use url::Url;
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use utils::{backoff, generation::Generation, id::NodeId};
use crate::{ use crate::{
config::{NodeMetadata, PageServerConf}, config::{NodeMetadata, PageServerConf},
@@ -210,10 +210,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(), .collect(),
}; };
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); fail::fail_point!("control-plane-client-validate");
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
}
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

View File

@@ -58,6 +58,24 @@ paths:
responses: responses:
"200": "200":
description: The reload completed successfully. description: The reload completed successfully.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error (also hits if no keys were found)
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}: /v1/tenant/{tenant_id}:
parameters: parameters:
@@ -75,14 +93,62 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TenantInfo" $ref: "#/components/schemas/TenantInfo"
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete: delete:
description: | description: |
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
404 means that deletion successfully finished" 404 means that deletion successfully finished"
responses: responses:
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404": "404":
description: Tenant not found. This is the success path. description: Tenant not found
content: content:
application/json: application/json:
schema: schema:
@@ -99,6 +165,18 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/PreconditionFailedError" $ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/time_travel_remote_storage: /v1/tenant/{tenant_id}/time_travel_remote_storage:
parameters: parameters:
@@ -128,6 +206,36 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
"400":
description: Error when no tenant id found in path or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline: /v1/tenant/{tenant_id}/timeline:
parameters: parameters:
@@ -147,6 +255,36 @@ paths:
type: array type: array
items: items:
$ref: "#/components/schemas/TimelineInfo" $ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}: /v1/tenant/{tenant_id}/timeline/{timeline_id}:
@@ -171,12 +309,60 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TimelineInfo" $ref: "#/components/schemas/TimelineInfo"
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
delete: delete:
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
responses: responses:
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404": "404":
description: Timeline not found. This is the success path. description: Timeline not found
content: content:
application/json: application/json:
schema: schema:
@@ -193,6 +379,18 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/PreconditionFailedError" $ref: "#/components/schemas/PreconditionFailedError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
parameters: parameters:
@@ -225,6 +423,36 @@ paths:
schema: schema:
type: string type: string
format: date-time format: date-time
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found, or there is no timestamp information for the given lsn
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters: parameters:
@@ -256,6 +484,36 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/LsnByTimestampResponse" $ref: "#/components/schemas/LsnByTimestampResponse"
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
parameters: parameters:
@@ -279,6 +537,36 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_shard_id}/location_config: /v1/tenant/{tenant_shard_id}/location_config:
parameters: parameters:
- name: tenant_shard_id - name: tenant_shard_id
@@ -340,6 +628,24 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TenantLocationConfigResponse" $ref: "#/components/schemas/TenantLocationConfigResponse"
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409": "409":
description: | description: |
The tenant is already known to Pageserver in some way, The tenant is already known to Pageserver in some way,
@@ -356,6 +662,12 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/ConflictError" $ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/ignore: /v1/tenant/{tenant_id}/ignore:
parameters: parameters:
- name: tenant_id - name: tenant_id
@@ -372,6 +684,36 @@ paths:
responses: responses:
"200": "200":
description: Tenant ignored description: Tenant ignored
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/load: /v1/tenant/{tenant_id}/load:
@@ -398,6 +740,36 @@ paths:
responses: responses:
"202": "202":
description: Tenant scheduled to load successfully description: Tenant scheduled to load successfully
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
parameters: parameters:
@@ -418,6 +790,37 @@ paths:
responses: responses:
"202": "202":
description: Tenant scheduled to load successfully description: Tenant scheduled to load successfully
"404":
description: No tenant or timeline found for the specified ids
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size: /v1/tenant/{tenant_id}/synthetic_size:
parameters: parameters:
@@ -436,8 +839,31 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/SyntheticSizeResponse" $ref: "#/components/schemas/SyntheticSizeResponse"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
# This route has no handler. TODO: remove?
/v1/tenant/{tenant_id}/size: /v1/tenant/{tenant_id}/size:
parameters: parameters:
- name: tenant_id - name: tenant_id
@@ -519,6 +945,18 @@ paths:
responses: responses:
"200": "200":
description: Success description: Success
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_shard_id}/secondary/download: /v1/tenant/{tenant_shard_id}/secondary/download:
parameters: parameters:
@@ -549,6 +987,20 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/SecondaryProgress" $ref: "#/components/schemas/SecondaryProgress"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/: /v1/tenant/{tenant_id}/timeline/:
parameters: parameters:
@@ -591,6 +1043,24 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TimelineInfo" $ref: "#/components/schemas/TimelineInfo"
"400":
description: Malformed timeline create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"406": "406":
description: Permanently unsatisfiable request, don't retry. description: Permanently unsatisfiable request, don't retry.
content: content:
@@ -609,6 +1079,18 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/Error" $ref: "#/components/schemas/Error"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/: /v1/tenant/:
get: get:
@@ -622,6 +1104,30 @@ paths:
type: array type: array
items: items:
$ref: "#/components/schemas/TenantInfo" $ref: "#/components/schemas/TenantInfo"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
post: post:
description: | description: |
@@ -642,12 +1148,43 @@ paths:
application/json: application/json:
schema: schema:
type: string type: string
"400":
description: Malformed tenant create request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409": "409":
description: Tenant already exists, creation skipped description: Tenant already exists, creation skipped
content: content:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/ConflictError" $ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/config: /v1/tenant/config:
put: put:
@@ -669,6 +1206,36 @@ paths:
type: array type: array
items: items:
$ref: "#/components/schemas/TenantInfo" $ref: "#/components/schemas/TenantInfo"
"400":
description: Malformed tenant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/config/: /v1/tenant/{tenant_id}/config/:
parameters: parameters:
@@ -688,6 +1255,42 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/TenantConfigResponse" $ref: "#/components/schemas/TenantConfigResponse"
"400":
description: Malformed get tenant config request
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenant or timeline was not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/utilization: /v1/utilization:
get: get:
@@ -701,6 +1304,12 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/PageserverUtilization" $ref: "#/components/schemas/PageserverUtilization"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
components: components:
securitySchemes: securitySchemes:
@@ -1020,7 +1629,7 @@ components:
type: integer type: integer
format: int64 format: int64
minimum: 0 minimum: 0
description: The amount of disk space currently used. description: The amount of disk space currently utilized by layer files.
free_space_bytes: free_space_bytes:
type: integer type: integer
format: int64 format: int64

View File

@@ -457,12 +457,8 @@ async fn reload_auth_validation_keys_handler(
json_response(StatusCode::OK, ()) json_response(StatusCode::OK, ())
} }
Err(e) => { Err(e) => {
let err_msg = "Error reloading public keys";
warn!("Error reloading public keys from {key_path:?}: {e:}"); warn!("Error reloading public keys from {key_path:?}: {e:}");
json_response( json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
StatusCode::INTERNAL_SERVER_ERROR,
HttpErrorBody::from_msg(err_msg.to_string()),
)
} }
} }
} }
@@ -700,7 +696,7 @@ async fn get_lsn_by_timestamp_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?; check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request); let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero // Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero" "Size calculations are only available on shard zero"
@@ -751,7 +747,7 @@ async fn get_timestamp_of_lsn_handler(
check_permission(&request, Some(tenant_shard_id.tenant_id))?; check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request); let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero // Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero" "Size calculations are only available on shard zero"
@@ -776,9 +772,7 @@ async fn get_timestamp_of_lsn_handler(
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
json_response(StatusCode::OK, time) json_response(StatusCode::OK, time)
} }
None => Err(ApiError::NotFound( None => json_response(StatusCode::NOT_FOUND, ()),
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
)),
} }
} }
@@ -999,26 +993,11 @@ async fn tenant_status(
check_permission(&request, Some(tenant_shard_id.tenant_id))?; check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request); let state = get_state(&request);
// In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
let activate = true;
#[cfg(feature = "testing")]
let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
let tenant_info = async { let tenant_info = async {
let tenant = state let tenant = state
.tenant_manager .tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?; .get_attached_tenant_shard(tenant_shard_id)?;
if activate {
// This is advisory: we prefer to let the tenant activate on-demand when this function is
// called, but it is still valid to return 200 and describe the current state of the tenant
// if it doesn't make it into an active state.
tenant
.wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
.await
.ok();
}
// Calculate total physical size of all timelines // Calculate total physical size of all timelines
let mut current_physical_size = 0; let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() { for timeline in tenant.list_timelines().iter() {
@@ -1092,7 +1071,7 @@ async fn tenant_size_handler(
let headers = request.headers(); let headers = request.headers();
let state = get_state(&request); let state = get_state(&request);
if !tenant_shard_id.is_shard_zero() { if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!( return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero" "Size calculations are only available on shard zero"
))); )));

View File

@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes; use bytes::Bytes;
use camino::Utf8Path; use camino::Utf8Path;
use futures::StreamExt; use futures::StreamExt;
use pageserver_api::key::rel_block_to_key;
use tokio::io::{AsyncRead, AsyncReadExt}; use tokio::io::{AsyncRead, AsyncReadExt};
use tokio_tar::Archive; use tokio_tar::Archive;
use tracing::*; use tracing::*;
@@ -171,10 +170,7 @@ async fn import_rel(
let r = reader.read_exact(&mut buf).await; let r = reader.read_exact(&mut buf).await;
match r { match r {
Ok(_) => { Ok(_) => {
let key = rel_block_to_key(rel, blknum); modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
if modification.tline.get_shard_identity().is_key_local(&key) {
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
} }
// TODO: UnexpectedEof is expected // TODO: UnexpectedEof is expected

View File

@@ -1483,18 +1483,12 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
}); });
pub(crate) struct WalIngestMetrics { pub(crate) struct WalIngestMetrics {
pub(crate) bytes_received: IntCounter,
pub(crate) records_received: IntCounter, pub(crate) records_received: IntCounter,
pub(crate) records_committed: IntCounter, pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter, pub(crate) records_filtered: IntCounter,
} }
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics { pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
"Bytes of WAL ingested from safekeepers",
)
.unwrap(),
records_received: register_int_counter!( records_received: register_int_counter!(
"pageserver_wal_ingest_records_received", "pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers" "Number of WAL records received from safekeepers"
@@ -1518,8 +1512,7 @@ pub(crate) struct SecondaryModeMetrics {
pub(crate) download_heatmap: IntCounter, pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter, pub(crate) download_layer: IntCounter,
} }
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| { pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
SecondaryModeMetrics {
upload_heatmap: register_int_counter!( upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap", "pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants" "Number of heatmaps written to remote storage by attached tenants"
@@ -1537,7 +1530,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
.expect("failed to define a metric"), .expect("failed to define a metric"),
download_heatmap: register_int_counter!( download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap", "pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" "Number of downloads of heatmaps by secondary mode locations"
) )
.expect("failed to define a metric"), .expect("failed to define a metric"),
download_layer: register_int_counter!( download_layer: register_int_counter!(
@@ -1545,7 +1538,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
"Number of downloads of layers by secondary mode locations" "Number of downloads of layers by secondary mode locations"
) )
.expect("failed to define a metric"), .expect("failed to define a metric"),
}
}); });
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1821,29 +1813,6 @@ impl Default for WalRedoProcessCounters {
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> = pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default); Lazy::new(WalRedoProcessCounters::default);
/// Metric reporting which walredo process kind is configured.
///
/// The module is excluded from test builds via `cfg(not(test))`.
#[cfg(not(test))]
pub mod wal_redo {
    use super::*;

    /// Gauge vector keyed by the `kind` label. It sits behind a mutex so that
    /// the reset-then-set sequence in [`set_process_kind_metric`] is executed
    /// by one caller at a time.
    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
        let gauge_vec = register_uint_gauge_vec!(
            "pageserver_wal_redo_process_kind",
            "The configured process kind for walredo",
            &["kind"],
        )
        .unwrap();
        std::sync::Mutex::new(gauge_vec)
    });

    /// Resets the gauge vector and then sets the series labelled with the
    /// `Display` form of `kind` to `1`.
    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
        // Keep the lock for both steps so that they cannot interleave with
        // another caller's reset/set pair.
        let gauges = PROCESS_KIND.lock().unwrap();
        gauges.reset();
        let label = kind.to_string();
        gauges.with_label_values(&[label.as_str()]).set(1);
    }
}
/// Similar to `prometheus::HistogramTimer` but does not record on drop. /// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer { pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics, metrics: StorageTimeMetrics,
@@ -2114,7 +2083,7 @@ impl TimelineMetrics {
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
// Only shard zero deals in synthetic sizes // Only shard zero deals in synthetic sizes
if tenant_shard_id.is_shard_zero() { if tenant_shard_id.is_zero() {
let tid = tenant_shard_id.tenant_id.to_string(); let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
} }
@@ -2125,7 +2094,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
use futures::Future; use futures::Future;
use pin_project_lite::pin_project; use pin_project_lite::pin_project;
use std::collections::HashMap; use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin; use std::pin::Pin;
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use std::task::{Context, Poll}; use std::task::{Context, Poll};
@@ -2695,26 +2663,6 @@ pub(crate) mod disk_usage_based_eviction {
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default); pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
} }
/// Lazily registered gauge vector named
/// `pageserver_tokio_executor_thread_configured_count`, with a single `setup` label.
///
/// Registration happens on first access; a registration error panics through
/// the `unwrap()` at the end of the initializer.
static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tokio_executor_thread_configured_count",
"Total number of configued tokio executor threads in the process.
The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
&["setup"],
)
.unwrap()
});
/// Publishes the tokio runtime configuration through `TOKIO_EXECUTOR_THREAD_COUNT`.
///
/// All existing label values of the gauge vector are reset, then the series
/// labelled `setup` is set to `num_threads`.
///
/// # Panics
///
/// Panics if the internal mutex is poisoned, if the labelled gauge cannot be
/// obtained, or if `num_threads` does not fit in a `u64`.
pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
    // Function-local lock: callers take it before the reset + set pair below,
    // so two concurrent calls cannot interleave those steps.
    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
    let _serialized = SERIALIZE.lock().unwrap();

    TOKIO_EXECUTOR_THREAD_COUNT.reset();

    let gauge = TOKIO_EXECUTOR_THREAD_COUNT
        .get_metric_with_label_values(&[setup])
        .unwrap();
    let thread_count = u64::try_from(num_threads.get()).unwrap();
    gauge.set(thread_count);
}
pub fn preinitialize_metrics() { pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting. // Python tests need these and on some we do alerting.
// //

Some files were not shown because too many files have changed in this diff Show More