Compare commits

..

1 Commits

Author SHA1 Message Date
Arseny Sher
b8fad41fbe Cap WAL download for LR on compute start by max_slot_wal_keep_size. 2023-12-22 15:55:45 +03:00
91 changed files with 1176 additions and 5488 deletions

View File

@@ -1,105 +0,0 @@
# Reusable workflow (triggered only through `workflow_call`) that builds a
# Docker image for amd64 and arm64 and publishes a combined manifest.
name: Build and Push Docker Image
on:
workflow_call:
inputs:
# Dockerfile path passed to the kaniko executor's --dockerfile flag.
dockerfile-path:
required: true
type: string
# Repository name appended to the ECR registry host when tagging the image.
image-name:
required: true
type: string
outputs:
# Re-exports the tag computed by the `tag` job so callers can reference the image.
build-tools-tag:
description: "tag generated for build tools"
value: ${{ jobs.tag.outputs.build-tools-tag }}
jobs:
# Reports whether the pull request touches Dockerfile.buildtools through the
# `docker_file_changed` output:
#   - non-pull_request events: writes "false" and exits early;
#   - pull request whose changed-file list contains "Dockerfile.buildtools": writes "true";
#   - any other pull request: nothing is written, so the output stays empty.
# The match is a substring test against the output of `gh pr diff --name-only`.
check-if-build-tools-dockerfile-changed:
runs-on: ubuntu-latest
outputs:
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
steps:
- name: Check if Dockerfile.buildtools has changed
id: dockerfile
run: |
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
exit
fi
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Chooses the image tag exposed as `build-tools-tag`:
#   - pull request that changed the Dockerfile: the numeric GITHUB_RUN_ID;
#   - everything else: the literal tag `pinned`.
tag:
runs-on: ubuntu-latest
needs: [ check-if-build-tools-dockerfile-changed ]
outputs:
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
steps:
- name: Get buildtools tag
# The upstream job's output is passed in through the environment and read as ${DOCKERFILE_CHANGED}.
env:
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
run: |
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
IMAGE_TAG=$GITHUB_RUN_ID
else
IMAGE_TAG=pinned
fi
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
shell: bash
id: buildtools-tag
# amd64 image build. Runs only when the Dockerfile change check reported 'true'.
# Executes inside the kaniko executor container on a self-hosted x64 runner and
# tags the result `<build-tools-tag>-amd64` in the ECR registry.
kaniko:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
# Writes a Docker client config that selects the `ecr-login` credential store.
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
# Builds from the caller-supplied Dockerfile, using the shared ECR `cache` repository for layer caching.
- name: Kaniko build
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
# arm64 image build. Same steps as the `kaniko` job, but scheduled on a
# self-hosted arm64 runner and tagged `<build-tools-tag>-arm64`.
kaniko-arm:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
needs: [ tag, check-if-build-tools-dockerfile-changed ]
runs-on: [ self-hosted, dev, arm64 ]
container: gcr.io/kaniko-project/executor:v1.7.0-debug
steps:
- name: Checkout
uses: actions/checkout@v1
# Writes a Docker client config that selects the `ecr-login` credential store.
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
# Builds from the caller-supplied Dockerfile, using the shared ECR `cache` repository for layer caching.
- name: Kaniko build
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
# Combines the per-architecture images into one multi-arch manifest published
# under the plain `<build-tools-tag>`. Waits for both kaniko jobs and, like
# them, runs only when the Dockerfile change check reported 'true'.
manifest:
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
name: 'manifest'
runs-on: [ self-hosted, dev, x64 ]
needs:
- tag
- kaniko
- kaniko-arm
- check-if-build-tools-dockerfile-changed
steps:
# References the `-amd64` and `-arm64` tags produced by the two build jobs.
- name: Create manifest
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
- name: Push manifest
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}

View File

@@ -44,6 +44,7 @@ jobs:
exit 1
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
@@ -73,19 +74,11 @@ jobs:
shell: bash
id: build-tag
build-buildtools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
dockerfile-path: Dockerfile.buildtools
image-name: build-tools
secrets: inherit
check-codestyle-python:
needs: [ check-permissions, build-buildtools-image ]
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -115,10 +108,10 @@ jobs:
run: poetry run mypy .
check-codestyle-rust:
needs: [ check-permissions, build-buildtools-image ]
needs: [ check-permissions ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -182,10 +175,10 @@ jobs:
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag, build-buildtools-image ]
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
strategy:
fail-fast: false
@@ -345,7 +338,7 @@ jobs:
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
@@ -415,10 +408,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
needs: [ check-permissions, build-neon, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
strategy:
@@ -454,10 +447,10 @@ jobs:
uses: ./.github/actions/save-coverage-data
benchmarks:
needs: [ check-permissions, build-neon, build-buildtools-image ]
needs: [ check-permissions, build-neon ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
# Default shared memory is 64mb
options: --init --shm-size=512mb
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -486,12 +479,12 @@ jobs:
# while coverage is currently collected for the debug ones
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
steps:
@@ -533,10 +526,11 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests, build-buildtools-image ]
needs: [ check-permissions, regress-tests ]
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
strategy:
fail-fast: false
@@ -700,7 +694,7 @@ jobs:
}"
neon-image:
needs: [ check-permissions, build-buildtools-image, tag ]
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
@@ -739,7 +733,6 @@ jobs:
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -750,7 +743,7 @@ jobs:
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ check-permissions, build-buildtools-image, tag ]
needs: [ check-permissions, tag ]
container: gcr.io/kaniko-project/executor:v1.9.2-debug
defaults:
run:
@@ -785,7 +778,6 @@ jobs:
--context .
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -796,7 +788,7 @@ jobs:
run: rm -rf ~/.ecr
compute-node-image:
needs: [ check-permissions, build-buildtools-image, tag ]
needs: [ check-permissions, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -844,7 +836,6 @@ jobs:
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -866,7 +857,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.21.0
VM_BUILDER_VERSION: v0.19.0
steps:
- name: Checkout

View File

@@ -218,7 +218,7 @@ jobs:
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

View File

@@ -1,130 +0,0 @@
name: 'Update build tools image tag'
# This workflow is used to update the tag of the build tools image in ECR.
# The most common use case is adding/moving the `pinned` tag to the `${GITHUB_RUN_ID}` image.
on:
# Manual trigger only; both tags are supplied by the person starting the run.
workflow_dispatch:
inputs:
from-tag:
description: 'Source tag'
required: true
type: string
to-tag:
description: 'Destination tag'
required: true
type: string
default: 'pinned'
defaults:
run:
# Every `run` step uses bash with errexit, nounset and pipefail enabled.
shell: bash -euo pipefail {0}
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
permissions: {}
jobs:
# Points TO_TAG at the image currently tagged FROM_TAG in the build-tools repository.
# Steps, in order:
#   1. install `crane` and the ECR credential helper with `go install`, each pinned to a commit;
#   2. write a Docker client config that selects the `ecr-login` credential store;
#   3. resolve the digest behind FROM_TAG and fail the job if it cannot be resolved;
#   4. resolve the digest behind TO_TAG if that tag exists (absence is not an error);
#   5. run `crane tag` to apply TO_TAG to the FROM_TAG image.
# The digests from steps 3 and 4 are exported as job outputs and consumed by
# the `rollback-tag-image` job.
tag-image:
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
outputs:
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Get source image digest
id: next-digest
run: |
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
exit 1
fi
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
- name: Get destination image digest (if already exists)
id: prev-digest
run: |
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
if [ -z "${PREV_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
else
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
fi
- name: Tag image
run: |
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
# Runs after `tag-image` whenever the run is not successful (`!success()`), and
# restores TO_TAG to the digest it had before the run. The restore step:
#   - exits 0 when no source digest was recorded (FROM_TAG was never resolved);
#   - exits 0 when no previous digest was recorded (TO_TAG did not exist before);
#   - otherwise re-tags the previous digest as TO_TAG, but only if TO_TAG
#     currently resolves to the digest this run tried to apply.
# The tool installation and ECR login steps repeat those of `tag-image`.
rollback-tag-image:
needs: tag-image
if: ${{ !success() }}
runs-on: [ self-hosted, gen3, small ]
container: golang:1.19-bullseye
env:
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG: ${{ inputs.from-tag }}
TO_TAG: ${{ inputs.to-tag }}
steps:
- name: Install Crane & ECR helper
run: |
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Restore previous tag if needed
run: |
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
if [ -z "${NEXT_DIGEST}" ]; then
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
exit 0
fi
if [ -z "${PREV_DIGEST}" ]; then
# I guess we should delete the tag here/untag the image, but crane does not support it
# - https://github.com/google/go-containerregistry/issues/999
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
exit 0
fi
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
else
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
fi

1
.gitignore vendored
View File

@@ -6,7 +6,6 @@ __pycache__/
test_output/
.vscode
.idea
neon.iml
/.neon
/integration_tests/.neon

View File

@@ -70,17 +70,3 @@ We're using the following approach to make it work:
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
## How do I add the "pinned" tag to a buildtools image?
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup. Currently, adding the `pinned` tag is a manual operation.
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
or using GitHub CLI:
```bash
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-f from-tag=6254913013 \
-f to-tag=pinned \
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
```

36
Cargo.lock generated
View File

@@ -2106,20 +2106,6 @@ dependencies = [
"hashbrown 0.13.2",
]
[[package]]
name = "hdrhistogram"
version = "7.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
dependencies = [
"base64 0.21.1",
"byteorder",
"crossbeam-channel",
"flate2",
"nom",
"num-traits",
]
[[package]]
name = "heapless"
version = "0.8.0"
@@ -3071,28 +3057,6 @@ dependencies = [
"sha2",
]
[[package]]
name = "pagebench"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"hdrhistogram",
"humantime",
"humantime-serde",
"pageserver",
"pageserver_api",
"pageserver_client",
"rand 0.8.5",
"serde",
"serde_json",
"tokio",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "pagectl"
version = "0.1.0"

View File

@@ -6,7 +6,6 @@ members = [
"pageserver",
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"safekeeper",
"storage_broker",
@@ -80,7 +79,6 @@ futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.13"
hashlink = "0.8.1"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"

View File

@@ -3,7 +3,7 @@
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
### inside this image in the real deployments.
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG IMAGE=rust
ARG TAG=pinned
# Build Postgres

View File

@@ -1,165 +0,0 @@
# Base image for the build tools image.
FROM debian:bullseye-slim
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
# Run the shell form of all following RUN instructions with bash.
SHELL ["/bin/bash", "-c"]
# System deps
# Installs the compiler toolchain, development headers and CLI utilities used
# by the later stanzas, then drops the apt lists and temp files in the same
# layer. `apt-get` is used instead of `apt`, whose command-line interface is
# not guaranteed to be stable for scripted use.
RUN set -e \
    && apt-get update \
    && apt-get install -y \
        autoconf \
        automake \
        bison \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        flex \
        git \
        gnupg \
        gzip \
        jq \
        libcurl4-openssl-dev \
        libbz2-dev \
        libffi-dev \
        liblzma-dev \
        libncurses5-dev \
        libncursesw5-dev \
        libpq-dev \
        libreadline-dev \
        libseccomp-dev \
        libsqlite3-dev \
        libssl-dev \
        libstdc++-10-dev \
        libtool \
        libxml2-dev \
        libxmlsec1-dev \
        libxxhash-dev \
        lsof \
        make \
        netcat \
        net-tools \
        openssh-client \
        parallel \
        pkg-config \
        unzip \
        wget \
        xz-utils \
        zlib1g-dev \
        zstd \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# protobuf-compiler (protoc)
# Downloads the release zip matching the build machine's architecture
# (`uname -m`, with aarch64 rewritten to the aarch_64 spelling used in the
# release asset names), installs the binary and the bundled google/ include
# directory under /usr/local, and removes the archive and extracted directory.
# ENV uses the `key=value` form, matching the other ENV instructions in this file.
ENV PROTOC_VERSION=22.2
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
    && unzip -q protoc.zip -d protoc \
    && mv protoc/bin/protoc /usr/local/bin/protoc \
    && mv protoc/include/google /usr/local/include/google \
    && rm -rf protoc.zip protoc
# LLVM
# Registers the apt.llvm.org repository for the selected major version and
# installs clang and llvm from it. The loop then creates unversioned symlinks
# (e.g. clang-17 -> clang) next to every versioned binary; it runs in a nested
# single-quoted bash so that ${LLVM_VERSION} and ${f} are expanded by that
# inner shell from the environment set by ENV.
# `apt-get` is used instead of `apt`, whose CLI is not stable for scripted use.
# NOTE(review): `apt-key` is deprecated in newer Debian releases; revisit when
# moving off bullseye.
ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt-get update \
    && apt-get install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# PostgreSQL 14
# Registers the apt.postgresql.org (PGDG) repository for bullseye and installs
# only the PostgreSQL 14 client package from it.
# `apt-get` is used instead of `apt`, whose CLI is not stable for scripted use.
# NOTE(review): `apt-key` is deprecated in newer Debian releases; revisit when
# moving off bullseye.
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
    && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
    && apt-get update \
    && apt-get install -y postgresql-client-14 \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# AWS CLI
# Downloads the AWS CLI v2 bundle for the build machine's architecture, runs
# its installer, and removes both the archive and the extracted `aws/`
# directory so neither ends up in the image layer.
# `-f` makes curl fail on an HTTP error instead of saving the error body as the zip.
RUN curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
    && unzip -q awscliv2.zip \
    && ./aws/install \
    && rm -rf awscliv2.zip aws
# Mold: A Modern Linker
# Clones mold, checks out the pinned version, builds it with clang++ in
# Release mode and installs it.
# The build runs from mold/build, so cleanup has to go up two levels before
# removing the checkout; with a single `cd ..` the `rm -rf mold` ran inside
# the checkout, matched nothing, and left the whole source tree in the image.
# ENV uses the `key=value` form, matching the other ENV instructions in this file.
ENV MOLD_VERSION=v2.1.0
RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
    && cd mold/build \
    && git checkout ${MOLD_VERSION} \
    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
    && cmake --build . -j $(nproc) \
    && cmake --install . \
    && cd ../.. \
    && rm -rf mold
# LCOV
# Build lcov from a fork:
# It includes several bug fixes on top of the v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
# And patches from us:
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0)
#
# The first command installs the listed Perl modules through CPAN without
# running their test suites; `yes |` answers any interactive CPAN prompts.
# The tarball is pinned to a commit and verified against a SHA-256 checksum
# before extraction. After `make install`, both the tarball and the extracted
# source tree are removed so they do not stay in the image layer.
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
    && cd lcov \
    && make install \
    && cd .. \
    && rm -rf lcov lcov.tar.gz
# Switch to nonroot user
USER nonroot:nonroot
WORKDIR /home/nonroot
# Python
# Installs pyenv into the nonroot user's home, builds the pinned Python
# version with it and makes it the global interpreter, then installs pipenv,
# wheel and poetry with pip.
# ENV puts the pyenv shims/bin directories on PATH for every later instruction;
# the `export` lines repeat that inside this RUN.
# `-f` makes curl fail on an HTTP error instead of saving the error body as the
# installer script, and the installer is deleted once it has run.
ENV PYTHON_VERSION=3.9.2 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
RUN set -e \
    && cd $HOME \
    && curl -fsSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
    && chmod +x pyenv-installer \
    && ./pyenv-installer \
    && rm pyenv-installer \
    && export PYENV_ROOT=/home/nonroot/.pyenv \
    && export PATH="$PYENV_ROOT/bin:$PATH" \
    && export PATH="$PYENV_ROOT/shims:$PATH" \
    && pyenv install ${PYTHON_VERSION} \
    && pyenv global ${PYTHON_VERSION} \
    && python --version \
    && pip install --upgrade pip \
    && pip --version \
    && pip install pipenv wheel poetry
# Switch to nonroot user (again)
USER nonroot:nonroot
WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
#
# Installs rustup with the pinned toolchain as the default, adds the
# llvm-tools-preview, rustfmt and clippy components, and installs the cargo
# helper binaries. The cargo registry and git caches are removed afterwards,
# and cachepot is configured as the rustc wrapper for later builds.
# `-f` makes curl fail on an HTTP error instead of saving the error body as
# `rustup-init` and then executing it.
ENV RUSTC_VERSION=1.74.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -fsSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
    chmod +x rustup-init && \
    ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
    rm rustup-init && \
    export PATH="$HOME/.cargo/bin:$PATH" && \
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
    rustup component add llvm-tools-preview rustfmt clippy && \
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
    cargo install cargo-deny && \
    cargo install cargo-hack && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git
ENV RUSTC_WRAPPER=cachepot
# Show versions
# Prints the effective user and the version of each installed toolchain into
# the build log; the build fails here if any of these tools is not on PATH.
RUN whoami && \
    python --version && \
    pip --version && \
    cargo --version --verbose && \
    rustup --version --verbose && \
    rustc --version --verbose && \
    clang --version

View File

@@ -1,6 +1,6 @@
ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG

View File

@@ -1,7 +1,7 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG

View File

@@ -69,12 +69,6 @@ pub fn write_postgres_conf(
)?;
}
writeln!(file, "shared_buffers=8MB")?;
if let Some(stripe_size) = &spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={}", stripe_size)?;
}
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Static(lsn) => {

View File

@@ -1,18 +1,8 @@
use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use camino::Utf8PathBuf;
use hyper::{Method, StatusCode};
use pageserver_api::{
models::{
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
};
use postgres_connection::parse_host_port;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use serde::{Deserialize, Serialize};
use std::{path::PathBuf, process::Child};
use tracing::instrument;
use utils::id::{NodeId, TenantId};
pub struct AttachmentService {
@@ -26,7 +16,7 @@ const COMMAND: &str = "attachment_service";
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub tenant_id: TenantId,
pub node_id: Option<NodeId>,
}
@@ -37,7 +27,7 @@ pub struct AttachHookResponse {
#[derive(Serialize, Deserialize)]
pub struct InspectRequest {
pub tenant_shard_id: TenantShardId,
pub tenant_id: TenantId,
}
#[derive(Serialize, Deserialize)]
@@ -45,46 +35,6 @@ pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_http_addr: String,
pub listen_http_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = env.base_data_dir.join("attachments.json");
@@ -117,87 +67,31 @@ impl AttachmentService {
pub async fn start(&self) -> anyhow::Result<Child> {
let path_str = self.path.to_string_lossy();
let result = background_process::start_process(
background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.attachment_service_bin(),
["-l", &self.listen, "-p", &path_str],
[],
background_process::InitialPidFile::Create(self.pid_file()),
|| async {
match self.status().await {
Ok(_) => Ok(true),
Err(_) => Ok(false),
}
},
// TODO: a real status check
|| async move { anyhow::Ok(true) },
)
.await;
for ps_conf in &self.env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
self.node_register(NodeRegisterRequest {
node_id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
})
.await?;
}
result
.await
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())
}
/// Simple HTTP request wrapper for calling into attachment service
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> anyhow::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join(&path)
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
let response = builder.send().await?;
if response.status() != StatusCode::OK {
return Err(anyhow!(
"Unexpected status {} on {}",
response.status(),
path
));
}
Ok(response.json().await?)
}
/// Call into the attach_hook API, for use before handing out attachments to pageservers
#[instrument(skip(self))]
pub async fn attach_hook(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
pageserver_id: NodeId,
) -> anyhow::Result<Option<u32>> {
use hyper::StatusCode;
let url = self
.env
.control_plane_api
@@ -207,7 +101,7 @@ impl AttachmentService {
.unwrap();
let request = AttachHookRequest {
tenant_shard_id,
tenant_id,
node_id: Some(pageserver_id),
};
@@ -220,11 +114,9 @@ impl AttachmentService {
Ok(response.gen)
}
#[instrument(skip(self))]
pub async fn inspect(
&self,
tenant_shard_id: TenantShardId,
) -> anyhow::Result<Option<(u32, NodeId)>> {
pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
let url = self
.env
.control_plane_api
@@ -233,7 +125,7 @@ impl AttachmentService {
.join("inspect")
.unwrap();
let request = InspectRequest { tenant_shard_id };
let request = InspectRequest { tenant_id };
let response = self.client.post(url).json(&request).send().await?;
if response.status() != StatusCode::OK {
@@ -243,59 +135,4 @@ impl AttachmentService {
let response = response.json::<InspectResponse>().await?;
Ok(response.attachment)
}
#[instrument(skip(self))]
pub async fn tenant_create(
&self,
req: TenantCreateRequest,
) -> anyhow::Result<TenantCreateResponse> {
self.dispatch(Method::POST, "tenant".to_string(), Some(req))
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
.await
}
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
pub async fn tenant_split(
&self,
tenant_id: TenantId,
new_shard_count: u8,
) -> anyhow::Result<TenantShardSplitResponse> {
self.dispatch(
Method::PUT,
format!("tenant/{tenant_id}/shard_split"),
Some(TenantShardSplitRequest { new_shard_count }),
)
.await
}
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
.await
}
#[instrument(skip(self))]
pub async fn status(&self) -> anyhow::Result<()> {
self.dispatch::<(), ()>(Method::GET, "status".to_string(), None)
.await
}
#[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
pub async fn tenant_timeline_create(
&self,
tenant_id: TenantId,
req: TimelineCreateRequest,
) -> anyhow::Result<TimelineInfo> {
self.dispatch(
Method::POST,
format!("tenant/{tenant_id}/timeline"),
Some(req),
)
.await
}
}

View File

@@ -6,22 +6,14 @@
///
use anyhow::anyhow;
use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use hyper::{Method, StatusCode};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
TenantLocationConfigRequest, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
use reqwest::Client;
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashMap};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::http::request::parse_request_param;
use utils::id::TenantId;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
@@ -32,7 +24,7 @@ use utils::{
json::{json_request, json_response},
RequestExt, RouterBuilder,
},
id::NodeId,
id::{NodeId, TenantId},
tcp_listener,
};
@@ -42,9 +34,7 @@ use pageserver_api::control_api::{
};
use control_plane::attachment_service::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, NodeRegisterRequest,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard,
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};
#[derive(Parser)]
@@ -60,71 +50,50 @@ struct Cli {
path: PathBuf,
}
/// Our latest knowledge of how this tenant is configured in the outside world.
///
/// Meaning:
/// * No instance of this type exists for a node: we are certain that we have nothing configured on that
/// node for this shard.
/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know
/// what it is (e.g. we failed partway through configuring it)
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
/// and that configuration will still be present unless something external interfered.
///
/// Instances are stored per node in `ObservedState::locations`, keyed by `NodeId`.
#[derive(Serialize, Deserialize)]
struct ObservedStateLocation {
/// If None, it means we do not know the status of this shard's location on this node, but
/// we know that we might have some state on this node.
conf: Option<LocationConfig>,
}
/// Per-node record of what we believe is configured for one tenant shard.
///
/// A node with no entry in `locations` has no `ObservedStateLocation` at all; the
/// `Default` value is an empty map.
#[derive(Serialize, Deserialize, Default)]
struct ObservedState {
/// Last known location state of the shard, keyed by the node it refers to.
locations: HashMap<NodeId, ObservedStateLocation>,
}
#[derive(Serialize, Deserialize)]
// The persistent state of each Tenant
#[derive(Serialize, Deserialize, Clone)]
struct TenantState {
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
// Currently attached pageserver
pageserver: Option<NodeId>,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
generation: u32,
observed: ObservedState,
config: TenantConfig,
}
#[derive(Serialize, Deserialize, Clone)]
struct NodeState {
id: NodeId,
fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
V: Clone + Serialize,
{
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
listen_http_addr: String,
listen_http_port: u16,
listen_pg_addr: String,
listen_pg_port: u16,
transformed
.collect::<HashMap<String, V>>()
.serialize(serializer)
}
impl NodeState {
fn base_url(&self) -> String {
format!(
"http://{}:{}/v1",
self.listen_http_addr, self.listen_http_port
)
}
fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
{
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
TenantId::from_hex(k)
.map(|k| (k, v))
.map_err(serde::de::Error::custom)
})
.collect()
}
// Top level state available to all HTTP handlers
#[derive(Serialize, Deserialize)]
struct PersistentState {
tenants: BTreeMap<TenantShardId, TenantState>,
pageservers: HashMap<NodeId, NodeState>,
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
tenants: HashMap<TenantId, TenantState>,
#[serde(skip)]
path: PathBuf,
@@ -158,8 +127,7 @@ impl PersistentState {
{
tracing::info!("Will create state file at {}", path.display());
Self {
tenants: BTreeMap::new(),
pageservers: HashMap::new(),
tenants: HashMap::new(),
path: path.to_owned(),
}
}
@@ -192,126 +160,6 @@ fn get_state(request: &Request<Body>) -> &State {
.as_ref()
}
impl TenantState {
async fn location_config(
&self,
node: &NodeState,
config: LocationConfig,
) -> anyhow::Result<()> {
let configure_request = TenantLocationConfigRequest {
tenant_shard_id: self.tenant_shard_id,
config,
};
let client = Client::new();
let response = client
.request(
Method::PUT,
format!(
"{}/tenant/{}/location_config",
node.base_url(),
self.tenant_shard_id
),
)
.json(&configure_request)
.send()
.await?;
response.error_for_status()?;
Ok(())
}
async fn timeline_create(
&self,
node: &NodeState,
req: &TimelineCreateRequest,
) -> anyhow::Result<TimelineInfo> {
let client = Client::new();
let response = client
.request(
Method::POST,
format!(
"{}/tenant/{}/timeline",
node.base_url(),
self.tenant_shard_id
),
)
.json(req)
.send()
.await?;
response.error_for_status_ref()?;
Ok(response.json().await?)
}
fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
if self.pageserver.is_some() {
return Ok(());
}
self.pageserver = Some(scheduler.schedule_shard()?);
Ok(())
}
async fn reconcile(
&mut self,
pageservers: &HashMap<NodeId, NodeState>,
) -> Result<(), ReconcileError> {
let wanted_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: Some(self.generation),
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
};
match self.pageserver {
Some(node_id) => {
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!("Observed configuration already correct.")
}
Some(_) | None => {
// If there is no observed configuration, or if its value does not equal our intent, then we must call out to the pageserver.
tracing::info!("Observed configuration requires update.");
let node = pageservers
.get(&node_id)
.expect("Pageserver may not be removed while referenced");
self.location_config(node, wanted_conf).await?;
}
}
}
None => {
// Detach everything
for node_id in self.observed.locations.keys() {
let node = pageservers
.get(node_id)
.expect("Pageserver may not be removed while referenced");
self.location_config(
node,
LocationConfig {
mode: LocationConfigMode::Detached,
generation: None,
secondary_conf: None,
shard_number: self.shard.number.0,
shard_count: self.shard.count.0,
shard_stripe_size: self.shard.stripe_size.0,
tenant_conf: self.config.clone(),
},
)
.await?;
}
}
}
Ok(())
}
}
/// Pageserver calls into this on startup, to learn which tenants it should attach
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
@@ -326,7 +174,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
id: *t,
// TODO(sharding): make this shard-aware
id: TenantShardId::unsharded(*t),
gen: state.generation,
});
}
@@ -349,7 +198,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
};
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
let valid = tenant_state.generation == req_tenant.gen;
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {})",
@@ -377,34 +227,30 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
let tenant_state = locked
.tenants
.entry(attach_req.tenant_shard_id)
.entry(attach_req.tenant_id)
.or_insert_with(|| TenantState {
tenant_shard_id: attach_req.tenant_shard_id,
pageserver: attach_req.node_id,
generation: 0,
shard: ShardIdentity::unsharded(),
observed: ObservedState::default(),
config: TenantConfig::default(),
});
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_shard_id,
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
}
tenant_state.pageserver = attach_req.node_id;
@@ -412,7 +258,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
tracing::info!(
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
attach_req.tenant_shard_id,
attach_req.tenant_id,
tenant_state.generation,
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
@@ -432,7 +278,7 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
let state = get_state(&req).inner.clone();
let locked = state.write().await;
let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id);
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
json_response(
StatusCode::OK,
@@ -442,510 +288,13 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
)
}
/// Scenarios in which we cannot find a suitable location for a tenant shard
///
/// Converted into `ApiError::Conflict` by the `From<ScheduleError>` impl.
#[derive(thiserror::Error, Debug)]
enum ScheduleError {
/// Returned by `Scheduler::schedule_shard` when it knows of no nodes at all.
#[error("No pageservers found")]
NoPageservers,
}
impl From<ScheduleError> for ApiError {
fn from(value: ScheduleError) -> Self {
ApiError::Conflict(format!("Scheduling error: {}", value))
}
}
/// Failure while reconciling a tenant shard (see `TenantState::reconcile`).
///
/// Converted into `ApiError::Conflict` by the `From<ReconcileError>` impl.
#[derive(thiserror::Error, Debug)]
enum ReconcileError {
/// Any other failure; displayed transparently as the wrapped `anyhow::Error`.
/// `#[from]` allows `?` to convert an `anyhow::Error` into this variant.
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<ReconcileError> for ApiError {
fn from(value: ReconcileError) -> Self {
ApiError::Conflict(format!("Reconciliation error: {}", value))
}
}
/// Picks a node for each tenant shard that needs one, preferring the least-loaded node.
///
/// Built from a `PersistentState` via `Scheduler::new`; it keeps its own counts and does
/// not write back to the persistent state.
struct Scheduler {
/// Number of tenant shards assigned to each known node, including shards this
/// scheduler has placed since it was created.
tenant_counts: HashMap<NodeId, usize>,
}
impl Scheduler {
    /// Build a scheduler from the current persistent state.
    ///
    /// Every registered pageserver starts at zero, then each tenant shard with a
    /// pageserver assigned adds one to that pageserver's count. A pageserver referenced
    /// by a tenant but not registered still gets an entry.
    fn new(persistent_state: &PersistentState) -> Self {
        let mut tenant_counts: HashMap<NodeId, usize> = persistent_state
            .pageservers
            .keys()
            .map(|node_id| (*node_id, 0))
            .collect();

        for ps in persistent_state
            .tenants
            .values()
            .filter_map(|tenant| tenant.pageserver)
        {
            *tenant_counts.entry(ps).or_insert(0) += 1;
        }

        Self { tenant_counts }
    }

    /// Choose the node with the fewest tenant shards and count the new shard against it.
    ///
    /// Ties are broken by the lowest node id, so the choice does not depend on `HashMap`
    /// iteration order.
    ///
    /// Returns `ScheduleError::NoPageservers` if no nodes are known.
    fn schedule_shard(&mut self) -> Result<NodeId, ScheduleError> {
        if self.tenant_counts.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }

        let mut tenant_counts: Vec<(NodeId, usize)> =
            self.tenant_counts.iter().map(|(k, v)| (*k, *v)).collect();

        // Sort by load first, then by node id, to make placement deterministic.
        tenant_counts.sort_by_key(|(node_id, count)| (*count, node_id.0));

        for (node_id, count) in &tenant_counts {
            tracing::info!("tenant_counts[{node_id}]={count}");
        }

        // Non-empty was checked above, so index 0 exists.
        let node_id = tenant_counts[0].0;
        tracing::info!("scheduler selected node {node_id}");
        *self.tenant_counts.entry(node_id).or_insert(0) += 1;
        Ok(node_id)
    }
}
async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
tracing::info!(
"Creating tenant {}, shard_count={:?}, have {} pageservers",
create_req.new_tenant_id,
create_req.shard_parameters.count,
locked.pageservers.len()
);
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Attempted to create a specific shard, this API is for creating the whole tenant"
)));
} else {
create_req.new_tenant_id.tenant_id
};
// Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded")
let literal_shard_count = if create_req.shard_parameters.is_unsharded() {
1
} else {
create_req.shard_parameters.count.0
};
let mut response_shards = Vec::new();
let mut scheduler = Scheduler::new(&locked);
for i in 0..literal_shard_count {
let shard_number = ShardNumber(i);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count: create_req.shard_parameters.count,
};
tracing::info!("Creating shard {tenant_shard_id}...");
use std::collections::btree_map::Entry;
match locked.tenants.entry(tenant_shard_id) {
Entry::Occupied(mut entry) => {
tracing::info!("Tenant shard {tenant_shard_id} already exists while creating");
if entry.get_mut().pageserver.is_none() {
entry.get_mut().pageserver = Some(scheduler.schedule_shard().map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?);
}
response_shards.push(TenantCreateResponseShard {
node_id: entry
.get()
.pageserver
.expect("We just set pageserver if it was None"),
generation: entry.get().generation,
});
continue;
}
Entry::Vacant(entry) => {
let state = TenantState {
tenant_shard_id,
pageserver: Some(scheduler.schedule_shard().map_err(|e| {
ApiError::Conflict(format!(
"Failed to schedule shard {tenant_shard_id}: {e}"
))
})?),
generation: create_req.generation.unwrap_or(1),
shard: ShardIdentity::from_params(shard_number, &create_req.shard_parameters),
observed: ObservedState::default(),
config: create_req.config.clone(),
};
response_shards.push(TenantCreateResponseShard {
node_id: state
.pageserver
.expect("We just set pageserver if it was None"),
generation: state.generation,
});
entry.insert(state)
}
};
}
// Take a snapshot of pageservers
let pageservers = locked.pageservers.clone();
for (tenant_shard_id, shard) in locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
{
shard.reconcile(&pageservers).await.map_err(|e| {
ApiError::Conflict(format!(
"Failed to reconcile tenant shard {}: {}",
tenant_shard_id, e
))
})?;
}
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(
StatusCode::OK,
TenantCreateResponse {
shards: response_shards,
},
)
}
async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let mut create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
tracing::info!(
"Creating timeline {}/{}, have {} pageservers",
tenant_id,
create_req.new_timeline_id,
locked.pageservers.len()
);
let mut scheduler = Scheduler::new(&locked);
// Take a snapshot of pageservers
let pageservers = locked.pageservers.clone();
let mut timeline_info = None;
for (_tenant_shard_id, shard) in locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
{
shard.schedule(&mut scheduler)?;
shard.reconcile(&pageservers).await?;
let node_id = shard.pageserver.expect("We just scheduled successfully");
let node = pageservers
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
let shard_timeline_info = shard
.timeline_create(node, &create_req)
.await
.map_err(|e| ApiError::Conflict(format!("Failed to create timeline: {e}")))?;
if timeline_info.is_none() {
// If the caller specified an ancestor but no ancestor LSN, we are responsible for
// propagating the LSN chosen by the first shard to the other shards: it is important
// that all shards end up with the same ancestor_start_lsn.
if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none()
{
create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn;
}
// We will return the TimelineInfo from the first shard
timeline_info = Some(shard_timeline_info);
}
}
json_response(StatusCode::OK, timeline_info)
}
/// Report where every shard of the tenant named by the `tenant_id` path parameter is
/// attached, together with the tenant's shard parameters.
///
/// Errors:
/// * `BadRequest` if any shard has no pageserver assigned.
/// * `InternalServerError` if shards disagree on stripe size.
/// * `NotFound` if the tenant has no shards.
async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let state = get_state(&req).inner.clone();
    let mut locked = state.write().await;

    tracing::info!("Locating shards for tenant {tenant_id}");

    // Take a snapshot of pageservers
    let pageservers = locked.pageservers.clone();

    let mut located = Vec::new();
    let mut shard_params: Option<ShardParameters> = None;

    let tenant_shards = locked
        .tenants
        .range_mut(TenantShardId::tenant_range(tenant_id));
    for (tenant_shard_id, shard) in tenant_shards {
        let node_id = shard
            .pageserver
            .ok_or(ApiError::BadRequest(anyhow::anyhow!(
                "Cannot locate a tenant that is not attached"
            )))?;

        let node = pageservers
            .get(&node_id)
            .expect("Pageservers may not be deleted while referenced");

        located.push(TenantLocateResponseShard {
            shard_id: *tenant_shard_id,
            node_id,
            listen_http_addr: node.listen_http_addr.clone(),
            listen_http_port: node.listen_http_port,
            listen_pg_addr: node.listen_pg_addr.clone(),
            listen_pg_port: node.listen_pg_port,
        });

        let stripe_size = Some(shard.shard.stripe_size);
        if let Some(known) = &shard_params {
            // This should never happen. We enforce at runtime because it's simpler than
            // adding an extra per-tenant data structure to store the things that should be the same
            if known.stripe_size != stripe_size {
                return Err(ApiError::InternalServerError(anyhow::anyhow!(
                    "Inconsistent shard stripe size parameters!"
                )));
            }
        } else {
            // First shard seen: it defines the parameters the others are checked against.
            shard_params = Some(ShardParameters {
                stripe_size,
                count: shard.shard.count,
            });
        }
    }

    if located.is_empty() {
        return Err(ApiError::NotFound(
            anyhow::anyhow!("No shards for this tenant ID found").into(),
        ));
    }
    let shard_params = shard_params.expect("result is non-empty, therefore this is set");

    let shard_list = located
        .iter()
        .map(|s| format!("{s:?}"))
        .collect::<Vec<_>>()
        .join(",");
    tracing::info!(
        "Located tenant {} with params {:?} on shards {}",
        tenant_id,
        shard_params,
        shard_list
    );

    let body = TenantLocateResponse {
        shards: located,
        shard_params,
    };
    json_response(StatusCode::OK, body)
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let mut locked = state.write().await;
locked.pageservers.insert(
register_req.node_id,
NodeState {
id: register_req.node_id,
listen_http_addr: register_req.listen_http_addr,
listen_http_port: register_req.listen_http_port,
listen_pg_addr: register_req.listen_pg_addr,
listen_pg_port: register_req.listen_pg_port,
},
);
tracing::info!(
"Registered pageserver {}, now have {} pageservers",
register_req.node_id,
locked.pageservers.len()
);
json_response(StatusCode::OK, ())
}
/// Split every shard of the tenant named by the `tenant_id` path parameter into
/// `new_shard_count` shards.
///
/// Phase 1 asks the pageserver of each shard to perform the split and collects the child
/// shard ids it reports. Shards that already have the requested shard count are skipped.
/// Phase 2 replaces each split shard in our state with its children, which inherit the
/// parent's pageserver, generation, stripe size and config. The state is then saved.
///
/// Responds with the ids of all newly created shards.
///
/// Errors:
/// * `BadRequest` if a shard has no pageserver assigned.
/// * `Conflict` if a pageserver request fails or returns an error status.
/// * `InternalServerError` if a pageserver response is malformed or saving fails.
async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
    let state = get_state(&req).inner.clone();
    let mut locked = state.write().await;

    let pageservers = locked.pageservers.clone();

    // One HTTP client is enough for all shards.
    let client = Client::new();

    let mut replacements = HashMap::new();

    for (tenant_shard_id, shard) in locked
        .tenants
        .range_mut(TenantShardId::tenant_range(tenant_id))
    {
        if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
            tracing::warn!(
                "Tenant shard {} already has shard count {}",
                tenant_shard_id,
                split_req.new_shard_count
            );
            continue;
        }

        let node_id = shard
            .pageserver
            .ok_or(ApiError::BadRequest(anyhow::anyhow!(
                "Cannot split a tenant that is not attached"
            )))?;

        let node = pageservers
            .get(&node_id)
            .expect("Pageservers may not be deleted while referenced");

        let response = client
            .request(
                Method::PUT,
                format!("{}/tenant/{}/shard_split", node.base_url(), tenant_shard_id),
            )
            .json(&TenantShardSplitRequest {
                new_shard_count: split_req.new_shard_count,
            })
            .send()
            .await
            .map_err(|e| {
                ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
            })?;
        // Check the status by reference so the body is still available for decoding below.
        response.error_for_status_ref().map_err(|e| {
            ApiError::Conflict(format!("Failed to split {}: {}", tenant_shard_id, e))
        })?;
        let response: TenantShardSplitResponse = response.json().await.map_err(|e| {
            ApiError::InternalServerError(anyhow::anyhow!(
                "Malformed response from pageserver: {}",
                e
            ))
        })?;

        tracing::info!(
            "Split {} into {}",
            tenant_shard_id,
            response
                .new_shards
                .iter()
                .map(|s| format!("{:?}", s))
                .collect::<Vec<_>>()
                .join(",")
        );

        replacements.insert(*tenant_shard_id, response.new_shards);
    }

    // Replace all the shards we just split with their children
    let mut response = TenantShardSplitResponse {
        new_shards: Vec::new(),
    };
    for (replaced, children) in replacements.into_iter() {
        // Removing the parent hands us ownership of its state, so its fields can be moved out.
        let old_state = locked
            .tenants
            .remove(&replaced)
            .expect("It was present, we just split it");
        let pageserver = old_state
            .pageserver
            .expect("Only shards with a pageserver are split above");
        let generation = old_state.generation;
        let shard_ident = old_state.shard;
        let config = old_state.config;

        for child in children {
            let mut child_shard = shard_ident;
            child_shard.number = child.shard_number;
            child_shard.count = child.shard_count;

            // The pageserver that performed the split now holds the child attached at the
            // parent's generation; record that as the child's observed state.
            let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
            child_observed.insert(
                pageserver,
                ObservedStateLocation {
                    conf: Some(LocationConfig {
                        mode: LocationConfigMode::AttachedSingle,
                        generation: Some(generation),
                        secondary_conf: None,
                        shard_number: child.shard_number.0,
                        shard_count: child.shard_count.0,
                        shard_stripe_size: shard_ident.stripe_size.0,
                        tenant_conf: config.clone(),
                    }),
                },
            );

            locked.tenants.insert(
                child,
                TenantState {
                    tenant_shard_id: child,
                    shard: child_shard,
                    pageserver: Some(pageserver),
                    generation,
                    observed: ObservedState {
                        locations: child_observed,
                    },
                    config: config.clone(),
                },
            );

            response.new_shards.push(child);
        }
    }

    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, response)
}
/// Status endpoint is just used for checking that our HTTP listener is up
///
/// The request is ignored; the response is always 200 OK with a unit (`()`) JSON body.
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
}
/// Build the HTTP router for this service.
///
/// `persistent_state` is wrapped in a `State` behind an `Arc` and attached as router data.
/// Every route runs its handler through `request_span`.
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
// Liveness check
.get("/status", |r| request_span(r, handle_status))
// Generation and attachment endpoints
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
// Node registration and tenant/shard management
.post("/node", |r| request_span(r, handle_node_register))
.post("/tenant", |r| request_span(r, handle_tenant_create))
.post("/tenant/:tenant_id/timeline", |r| {
request_span(r, handle_tenant_timeline_create)
})
.get("/tenant/:tenant_id/locate", |r| {
request_span(r, handle_tenant_locate)
})
.put("/tenant/:tenant_id/shard_split", |r| {
request_span(r, handle_tenant_shard_split)
})
}
#[tokio::main]

View File

@@ -15,10 +15,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::tenant_migration::migrate_tenant;
use control_plane::{broker, local_env};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -33,7 +30,6 @@ use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use url::Host;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -280,10 +276,10 @@ fn print_timeline(
/// Connects to the pageserver to query this information.
async fn get_timeline_infos(
env: &local_env::LocalEnv,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<HashMap<TimelineId, TimelineInfo>> {
Ok(get_default_pageserver(env)
.timeline_list(tenant_shard_id)
.timeline_list(tenant_id)
.await?
.into_iter()
.map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -301,20 +297,6 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
}
}
// Resolve the tenant shard id for commands that accept a shard suffix on --tenant-id.
// An explicit --tenant-id wins (a parse failure is returned as the error); otherwise the
// environment's default tenant is used as an unsharded id; otherwise this fails.
fn get_tenant_shard_id(
    sub_match: &ArgMatches,
    env: &local_env::LocalEnv,
) -> anyhow::Result<TenantShardId> {
    match parse_tenant_shard_id(sub_match).transpose() {
        Some(from_arguments) => from_arguments,
        None => match env.default_tenant_id {
            Some(default_id) => Ok(TenantShardId::unsharded(default_id)),
            None => anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant"),
        },
    }
}
fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
sub_match
.get_one::<String>("tenant-id")
@@ -323,14 +305,6 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
.context("Failed to parse tenant id from the argument string")
}
// Parse the optional --tenant-id argument as a TenantShardId.
// Returns Ok(None) when the argument is absent and an error when it does not parse.
fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
    let Some(id_str) = sub_match.get_one::<String>("tenant-id") else {
        return Ok(None);
    };
    let tenant_shard_id = TenantShardId::from_str(id_str)
        .context("Failed to parse tenant shard id from the argument string")?;
    Ok(Some(tenant_shard_id))
}
fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
sub_match
.get_one::<String>("timeline-id")
@@ -419,66 +393,47 @@ async fn handle_tenant(
Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config")
.map(|vals: clap::parser::ValuesRef<'_, String>| {
vals.flat_map(|c| c.split_once(':')).collect()
})
.map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
.unwrap_or_default();
let shard_count: u8 = create_match
.get_one::<u8>("shard-count")
.cloned()
.unwrap_or(0);
let shard_stripe_size: Option<u32> =
create_match.get_one::<u32>("shard-stripe-size").cloned();
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
// If tenant ID was not specified, generate one
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_create(TenantCreateRequest {
// Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
// attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
// type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters {
count: ShardCount(shard_count),
stripe_size: shard_stripe_size.map(ShardStripeSize),
},
config: tenant_conf,
})
let generation = if env.control_plane_api.is_some() {
// We must register the tenant with the attachment service, so
// that when the pageserver restarts, it will be re-attached.
let attachment_service = AttachmentService::from_env(env);
attachment_service
.attach_hook(tenant_id, pageserver.conf.id)
.await?
} else {
None
};
pageserver
.tenant_create(tenant_id, generation, tenant_conf)
.await?;
println!("tenant {tenant_id} successfully created on the pageserver");
// Create an initial timeline for the new tenant
let new_timeline_id =
parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
let new_timeline_id = parse_timeline_id(create_match)?;
let pg_version = create_match
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
// different shards picking different start lsns. Maybe we have to teach attachment service
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
attachment_service
.tenant_timeline_create(
let timeline_info = pageserver
.timeline_create(
tenant_id,
TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
ancestor_start_lsn: None,
existing_initdb_timeline_id: None,
pg_version: Some(pg_version),
},
new_timeline_id,
None,
None,
Some(pg_version),
None,
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(
DEFAULT_BRANCH_NAME.to_string(),
@@ -486,7 +441,9 @@ async fn handle_tenant(
new_timeline_id,
)?;
println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);
println!(
"Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
);
if create_match.get_flag("set-default") {
println!("Setting tenant {tenant_id} as a default one");
@@ -513,92 +470,12 @@ async fn handle_tenant(
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
let tenant_id = get_tenant_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_shard_id, new_pageserver).await?;
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
}
Some(("split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let attachment_service = AttachmentService::from_env(env);
let old_shards = attachment_service.tenant_locate(tenant_id).await?.shards;
let new_shard_count = old_shards.len() * 2;
if old_shards.len() > 127 {
bail!("Cannot split further");
}
attachment_service
.tenant_split(tenant_id, new_shard_count as u8)
.await?;
println!("Split {}->{}", old_shards.len(), new_shard_count);
}
Some(("status", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let mut shard_table = comfy_table::Table::new();
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
let mut tenant_synthetic_size = None;
let attachment_service = AttachmentService::from_env(env);
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
let size = pageserver
.http_client
.tenant_details(shard.shard_id)
.await?
.tenant_info
.current_physical_size
.unwrap();
shard_table.add_row([
format!("{}", shard.shard_id.shard_slug()),
format!("{}", shard.node_id.0),
format!("{} MiB", size / (1024 * 1024)),
]);
if shard.shard_id.is_zero() {
tenant_synthetic_size =
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
}
}
let Some(synthetic_size) = tenant_synthetic_size else {
bail!("Shard 0 not found")
};
let mut tenant_table = comfy_table::Table::new();
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
tenant_table.add_row([
"Synthetic size".to_string(),
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
]);
println!("{tenant_table}");
println!("{shard_table}");
}
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let attachment_service = AttachmentService::from_env(env);
let result = attachment_service
.tenant_split(tenant_id, shard_count)
.await?;
println!(
"Split tenant {} into shards {}",
tenant_id,
result
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
migrate_tenant(env, tenant_id, new_pageserver).await?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
@@ -612,10 +489,8 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
match timeline_match.subcommand() {
Some(("list", list_match)) => {
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
let tenant_id = get_tenant_id(list_match, env)?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
print_timelines_tree(timelines, env.timeline_name_mappings())?;
}
Some(("create", create_match)) => {
@@ -630,19 +505,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.context("Failed to parse postgres version from the argument string")?;
let new_timeline_id_opt = parse_timeline_id(create_match)?;
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
let attachment_service = AttachmentService::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: None,
existing_initdb_timeline_id: None,
ancestor_start_lsn: None,
pg_version: Some(pg_version),
};
let timeline_info = attachment_service
.tenant_timeline_create(tenant_id, create_req)
let timeline_info = pageserver
.timeline_create(
tenant_id,
new_timeline_id_opt,
None,
None,
Some(pg_version),
None,
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
@@ -700,6 +574,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
None,
pg_version,
ComputeMode::Primary,
DEFAULT_PAGESERVER_ID,
)?;
println!("Done");
}
@@ -723,18 +598,17 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse ancestor start Lsn from the request")?;
let new_timeline_id = TimelineId::generate();
let attachment_service = AttachmentService::from_env(env);
let create_req = TimelineCreateRequest {
new_timeline_id,
ancestor_timeline_id: Some(ancestor_timeline_id),
existing_initdb_timeline_id: None,
ancestor_start_lsn: start_lsn,
pg_version: None,
};
let timeline_info = attachment_service
.tenant_timeline_create(tenant_id, create_req)
let timeline_info = pageserver
.timeline_create(
tenant_id,
None,
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)
.await?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -761,10 +635,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
match sub_name {
"list" => {
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
// where shard 0 is attached, and query there.
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id)
.await
.unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
@@ -789,7 +661,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
for (endpoint_id, endpoint) in cplane
.endpoints
.iter()
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
.filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
{
let lsn_str = match endpoint.mode {
ComputeMode::Static(lsn) => {
@@ -808,10 +680,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
};
let branch_name = timeline_name_mappings
.get(&TenantTimelineId::new(
tenant_shard_id.tenant_id,
endpoint.timeline_id,
))
.get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
.map(|name| name.as_str())
.unwrap_or("?");
@@ -859,6 +728,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.copied()
.unwrap_or(false);
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
@@ -886,6 +762,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
http_port,
pg_version,
mode,
pageserver_id,
)?;
}
"start" => {
@@ -928,22 +805,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
endpoint.timeline_id,
)?;
let attachment_service = AttachmentService::from_env(env);
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
let pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
assert!(!pageservers.is_empty());
let stripe_size = locate_result.shard_params.stripe_size.map(|s| s.0 as usize);
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
@@ -955,13 +816,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
println!("Starting existing endpoint {endpoint_id}...");
endpoint
.start(
&auth_token,
safekeepers,
pageservers,
remote_ext_config,
stripe_size,
)
.start(&auth_token, safekeepers, remote_ext_config)
.await?;
}
"reconfigure" => {
@@ -972,31 +827,15 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageservers =
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
vec![(
pageserver.pg_connection_config.host().clone(),
pageserver.pg_connection_config.port(),
)]
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
} else {
let attachment_service = AttachmentService::from_env(env);
attachment_service
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported malformed host"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>()
None
};
endpoint.reconfigure(pageservers).await?;
endpoint.reconfigure(pageserver_id).await?;
}
"stop" => {
let endpoint_id = sub_args
@@ -1513,8 +1352,6 @@ fn cli() -> Command {
.arg(pg_version_arg.clone())
.arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
@@ -1525,14 +1362,6 @@ fn cli() -> Command {
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("shard-split")
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
)
)
.subcommand(
Command::new("pageserver")

View File

@@ -47,11 +47,10 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::spec::RemoteExtSpec;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use crate::attachment_service::AttachmentService;
use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use compute_api::responses::{ComputeState, ComputeStatus};
@@ -68,6 +67,7 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
pageserver_id: NodeId,
}
//
@@ -119,14 +119,19 @@ impl ComputeControlPlane {
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
pageserver_id: NodeId,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver,
timeline_id,
mode,
tenant_id,
@@ -152,6 +157,7 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates: true,
pageserver_id,
})?,
)?;
std::fs::write(
@@ -210,6 +216,7 @@ pub struct Endpoint {
// These are not part of the endpoint as such, but the environment
// the endpoint runs in.
pub env: LocalEnv,
pageserver: PageServerNode,
// Optimizations
skip_pg_catalog_updates: bool,
@@ -232,11 +239,15 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
Ok(Endpoint {
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver,
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,
@@ -453,21 +464,11 @@ impl Endpoint {
}
}
/// Render the given pageserver `(host, port)` pairs as a single comma-separated
/// list of `postgresql://no_user@HOST:PORT` URLs, preserving input order.
///
/// An empty slice produces an empty string.
fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
    let mut connstr = String::new();
    for (idx, (host, port)) in pageservers.iter().enumerate() {
        // Separator goes between entries only, never before the first one.
        if idx > 0 {
            connstr.push(',');
        }
        connstr.push_str(&format!("postgresql://no_user@{host}:{port}"));
    }
    connstr
}
pub async fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
pageservers: Vec<(Host, u16)>,
remote_ext_config: Option<&String>,
shard_stripe_size: Option<usize>,
) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
@@ -481,9 +482,13 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstring.is_empty());
let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
@@ -532,7 +537,6 @@ impl Endpoint {
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions,
shard_stripe_size,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -545,11 +549,8 @@ impl Endpoint {
// Launch compute_ctl
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new("/usr/bin/taskset");
cmd.args(["-c".to_string(), "8-11".to_string()])
.args([self.env.neon_distrib_dir.join("compute_ctl")])
.args(["--http-port", &self.http_address.port().to_string()])
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &self.connstr()])
.args([
@@ -658,7 +659,7 @@ impl Endpoint {
}
}
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
@@ -668,26 +669,24 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
// If we weren't given explicit pageservers, query the attachment service
if pageservers.is_empty() {
let attachment_service = AttachmentService::from_env(&self.env);
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
pageservers = locate_result
.shards
.into_iter()
.map(|shard| {
(
Host::parse(&shard.listen_pg_addr)
.expect("Attachment service reported bad hostname"),
shard.listen_pg_port,
)
})
.collect::<Vec<_>>();
}
if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
assert!(!pageserver_connstr.is_empty());
spec.pageserver_connstring = Some(pageserver_connstr);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}
let client = reqwest::Client::new();
let response = client

View File

@@ -1,6 +1,5 @@
//! Code to manage pageservers
//!
//!
//! In the local test environment, the pageserver stores its data directly in
//!
//! .neon/
@@ -13,15 +12,12 @@ use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::str::FromStr;
use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
@@ -220,19 +216,11 @@ impl PageServerNode {
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
let mut taskset_args = vec![
"-c".to_string(),
format!("{}", self.conf.id.0 - 1),
self.env.pageserver_bin().to_string_lossy().into(),
];
taskset_args.extend(args.into_iter().map(|a| a.to_string()));
background_process::start_process(
"pageserver",
&datadir,
&PathBuf::from_str("/usr/bin/taskset").unwrap(),
taskset_args,
&self.env.pageserver_bin(),
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
@@ -313,8 +301,16 @@ impl PageServerNode {
pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
self.http_client.list_tenants().await
}
pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
let result = models::TenantConfig {
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let mut settings = settings.clone();
let config = models::TenantConfig {
checkpoint_distance: settings
.remove("checkpoint_distance")
.map(|x| x.parse::<u64>())
@@ -375,26 +371,11 @@ impl PageServerNode {
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
} else {
Ok(result)
}
}
pub async fn tenant_create(
&self,
new_tenant_id: TenantId,
generation: Option<u32>,
settings: HashMap<&str, &str>,
) -> anyhow::Result<TenantId> {
let config = Self::parse_config(settings.clone())?;
let request = models::TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
generation,
config,
shard_parameters: ShardParameters::default(),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -490,32 +471,31 @@ impl PageServerNode {
pub async fn location_config(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
Ok(self
.http_client
.location_config(tenant_shard_id, config, flush_ms)
.location_config(tenant_id, config, flush_ms)
.await?)
}
pub async fn timeline_list(
&self,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
Ok(self.http_client.list_timelines(*tenant_id).await?)
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,
new_timeline_id: TimelineId,
new_timeline_id: Option<TimelineId>,
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
let req = models::TimelineCreateRequest {
new_timeline_id,
ancestor_start_lsn,
@@ -601,14 +581,4 @@ impl PageServerNode {
Ok(())
}
/// Fetch the synthetic size of one tenant shard through `self.http_client`.
///
/// A client error is propagated with `?`, which converts it into `anyhow::Error`.
pub async fn tenant_synthetic_size(
    &self,
    tenant_shard_id: TenantShardId,
) -> anyhow::Result<TenantHistorySize> {
    let history_size = self
        .http_client
        .tenant_synthetic_size(tenant_shard_id)
        .await?;
    Ok(history_size)
}
}

View File

@@ -11,17 +11,19 @@ use crate::{
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::time::Duration;
use utils::{id::TimelineId, lsn::Lsn};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
/// Given an attached pageserver, retrieve the LSN for all timelines
async fn get_lsns(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
let timelines = pageserver.timeline_list(&tenant_id).await?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
@@ -31,12 +33,12 @@ async fn get_lsns(
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
async fn await_lsn(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_shard_id, pageserver).await {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(e) => {
println!(
@@ -84,7 +86,7 @@ async fn await_lsn(
/// - reconfigure compute endpoints to point to new attached pageserver
pub async fn migrate_tenant(
env: &LocalEnv,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
// Get a new generation
@@ -106,7 +108,7 @@ pub async fn migrate_tenant(
}
}
let previous = attachment_service.inspect(tenant_shard_id).await?;
let previous = attachment_service.inspect(tenant_id).await?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
@@ -114,12 +116,10 @@ pub async fn migrate_tenant(
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service
.attach_hook(tenant_shard_id, dest_ps.conf.id)
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps
.location_config(tenant_shard_id, dest_conf, None)
.await?;
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");
return Ok(());
}
@@ -129,35 +129,33 @@ pub async fn migrate_tenant(
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps
.location_config(tenant_shard_id, stale_conf, Some(Duration::from_secs(10)))
.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
.await?;
baseline_lsns = Some(get_lsns(tenant_shard_id, &origin_ps).await?);
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}
let gen = attachment_service
.attach_hook(tenant_shard_id, dest_ps.conf.id)
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps
.location_config(tenant_shard_id, dest_conf, None)
.await?;
dest_ps.location_config(tenant_id, dest_conf, None).await?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_shard_id, &dest_ps, baseline).await?;
await_lsn(tenant_id, &dest_ps, baseline).await?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_shard_id.tenant_id && endpoint.status() == "running" {
if endpoint.tenant_id == tenant_id {
println!(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(vec![]).await?;
endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
}
}
@@ -173,29 +171,24 @@ pub async fn migrate_tenant(
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i == tenant_shard_id);
.any(|i| i.tenant_id == tenant_id);
if !found {
continue;
}
// // Downgrade to a secondary location
// let secondary_conf = build_location_config(
// LocationConfigMode::Secondary,
// None,
// Some(LocationConfigSecondary { warm: true }),
// );
// Downgrade to a secondary location
let secondary_conf = build_location_config(
LocationConfigMode::Secondary,
None,
Some(LocationConfigSecondary { warm: true }),
);
// println!(
// "💤 Switching to secondary mode on pageserver {}",
// other_ps.conf.id
// );
// other_ps
// .location_config(tenant_shard_id, secondary_conf, None)
// .await?;
let detached_conf = build_location_config(LocationConfigMode::Detached, None, None);
println!("💤 Detaching on pageserver {}", other_ps.conf.id);
println!(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps
.location_config(tenant_shard_id, detached_conf, None)
.location_config(tenant_id, secondary_conf, None)
.await?;
}
@@ -204,9 +197,7 @@ pub async fn migrate_tenant(
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps
.location_config(tenant_shard_id, dest_conf, None)
.await?;
dest_ps.location_config(tenant_id, dest_conf, None).await?;
println!("✅ Migration complete");

View File

@@ -1,61 +0,0 @@
# Demo: start a fresh local neon environment with $PAGESERVERS pageservers,
# create one tenant with $INITIAL_SHARDS shards and fill it with pgbench data.
#
# Usage: INITIAL_SHARDS=<n> bash demo_sharding.sh   (INITIAL_SHARDS defaults to 1)
set -u

export RUST_LOG=info

INITIAL_SHARDS="${INITIAL_SHARDS:-1}"
PAGESERVERS=8
STRIPE_SIZE=128
SCALE=100

export BUILD_ARGS=--release
# $ARGS is expanded unquoted below on purpose so that it splits into separate flags.
ARGS="${BUILD_ARGS} -q"

TENANT_ID=1f359dd625e519a1a4e8d7509690f6fc

set -x

# Best-effort cleanup of a previous run: every step here is allowed to fail.
set +e
cargo neon $ARGS stop
killall -9 storage_broker
killall -9 safekeeper
killall -9 pageserver
killall -9 postgres
killall -9 attachment_service
rm -rf .neon

# From here on any failing command aborts the script. The steps are written one
# per line rather than chained with `&&`: under `set -e` a failure of a
# non-final command in an `a && b` list does not stop the script.
set -e

cargo build $ARGS --features=testing

cargo neon $ARGS init --num-pageservers=$PAGESERVERS
cargo neon $ARGS start
cargo neon $ARGS tenant create --shard-count=$INITIAL_SHARDS --shard-stripe-size=$STRIPE_SIZE --tenant-id=$TENANT_ID --timeline-id=3d34095be52fec4c44a92e774c573b57 --set-default

cargo neon $ARGS endpoint create
cargo neon $ARGS endpoint start ep-main

pgbench postgres -i -h 127.0.0.1 -p 55432 -U cloud_admin -s $SCALE

cargo neon $ARGS tenant status

# pgbench postgres -h 127.0.0.1 -p 55432 -U cloud_admin -T 600 -P 1 -c 32

#
# tmux
# Ctrl+b+% horizontal split
# Ctrl+b-o toggle panes

# alias neon="cargo neon --release -q"

# Pt1: baseline: one pageserver
# INITIAL_SHARDS=1 bash demo_sharding.sh
# neon tenant status
# taskset -c 12-15 pgbench postgres -h 127.0.0.1 -p 55432 -U cloud_admin -T 30 -P 1 -c 64
# taskset -c 12-15 pgbench postgres -h 127.0.0.1 -p 55432 -U cloud_admin -T 30 -P 1 -c 64 -S

# Pt2: four shards
# INITIAL_SHARDS=4 bash demo_sharding.sh
# neon tenant status
# taskset -c 12-15 pgbench postgres -h 127.0.0.1 -p 55432 -U cloud_admin -T 30 -P 1 -c 64
# taskset -c 12-15 pgbench postgres -h 127.0.0.1 -p 55432 -U cloud_admin -T 30 -P 1 -c 64 -S

# Pt3: 8 shards
# bash demo_split_8.sh
# taskset -c 12-15 pgbench postgres -h 127.0.0.1 -p 55432 -U cloud_admin -T 30 -P 1 -c 64 -S

View File

@@ -1,16 +0,0 @@
# Demo: split the demo tenant into $FINAL_SHARDS shards and place each shard on
# its own pageserver. The endpoint is stopped for the duration of the split.
FINAL_SHARDS=4
TENANT_ID=1f359dd625e519a1a4e8d7509690f6fc
# The value must be quoted: a bare `ARGS=--release -q` runs a command named `-q`
# with ARGS set only in that command's environment, leaving ARGS unset here.
# $ARGS is expanded unquoted below on purpose so it splits into separate flags.
ARGS="--release -q"

cargo neon $ARGS endpoint stop ep-main

cargo neon $ARGS tenant shard-split --shard-count=$FINAL_SHARDS
cargo neon $ARGS tenant status

# Shard IDs have the form <tenant>-<shard number><shard count>, two digits each;
# the "04" suffix below has to match FINAL_SHARDS.
cargo neon $ARGS tenant migrate --tenant-id=$TENANT_ID-0004 --id=1
cargo neon $ARGS tenant migrate --tenant-id=$TENANT_ID-0104 --id=2
cargo neon $ARGS tenant migrate --tenant-id=$TENANT_ID-0204 --id=3
cargo neon $ARGS tenant migrate --tenant-id=$TENANT_ID-0304 --id=4
cargo neon $ARGS tenant status

cargo neon $ARGS endpoint start ep-main

View File

@@ -1,20 +0,0 @@
# Demo: split the demo tenant into $FINAL_SHARDS shards and move shard N onto
# pageserver N+1. The endpoint is stopped while the split and migrations run.
FINAL_SHARDS=8
TENANT_ID=1f359dd625e519a1a4e8d7509690f6fc
# $ARGS is expanded unquoted below on purpose so it splits into separate flags.
ARGS="--release -q"

cargo neon $ARGS endpoint stop ep-main

cargo neon $ARGS tenant shard-split --shard-count=$FINAL_SHARDS
cargo neon $ARGS tenant status

# Shard IDs are <tenant>-0<shard number>08 for shard numbers 0 through 7.
for shard in 0 1 2 3 4 5 6 7; do
    cargo neon $ARGS tenant migrate --tenant-id=$TENANT_ID-0${shard}08 --id=$((shard + 1))
done
cargo neon $ARGS tenant status

cargo neon $ARGS endpoint start ep-main

View File

@@ -73,9 +73,6 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
// Stripe size for pageserver sharding, in pages
pub shard_stripe_size: Option<usize>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

View File

@@ -18,10 +18,7 @@ use utils::{
lsn::Lsn,
};
use crate::{
reltag::RelTag,
shard::{ShardCount, ShardStripeSize, TenantShardId},
};
use crate::{reltag::RelTag, shard::TenantShardId};
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
@@ -191,41 +188,6 @@ pub struct TimelineCreateRequest {
pub pg_version: Option<u32>,
}
/// Serializable request type for a tenant shard split; carries the requested shard count.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
/// Requested shard count (presumably the tenant's total after the split; TODO confirm
/// against the code that handles this request).
pub new_shard_count: u8,
}
/// Serializable response type for a tenant shard split.
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitResponse {
/// Identifiers of the new shards (presumably the complete post-split set; TODO confirm).
pub new_shards: Vec<TenantShardId>,
}
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
///
/// Unknown fields are rejected when deserializing (`deny_unknown_fields`).
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct ShardParameters {
/// Shard count; `ShardCount(0)` is the value `is_unsharded` tests for.
pub count: ShardCount,
/// Optional stripe size. May be absent on input (`serde(default)`) and is left out of
/// the serialized form when it is `None`.
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub stripe_size: Option<ShardStripeSize>,
}
impl ShardParameters {
/// Returns true when `count` equals `ShardCount(0)`.
pub fn is_unsharded(&self) -> bool {
self.count == ShardCount(0)
}
}
impl Default for ShardParameters {
/// The default is the unsharded configuration: a count of zero and no explicit stripe size.
fn default() -> Self {
Self {
count: ShardCount(0),
stripe_size: None,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantCreateRequest {
@@ -233,12 +195,6 @@ pub struct TenantCreateRequest {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
// If omitted, create a single shard with TenantShardId::unsharded()
#[serde(default)]
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
pub shard_parameters: ShardParameters,
#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
@@ -261,7 +217,7 @@ impl std::ops::Deref for TenantCreateRequest {
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq, Clone)]
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
@@ -290,7 +246,7 @@ pub struct TenantConfig {
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
/// lists out all possible states (and the virtual "Detached" state)
/// in a flat form rather than using rust-style enums.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub enum LocationConfigMode {
AttachedSingle,
AttachedMulti,
@@ -299,14 +255,14 @@ pub enum LocationConfigMode {
Detached,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub struct LocationConfigSecondary {
pub warm: bool,
}
/// An alternative representation of `pageserver::tenant::LocationConf`,
/// for use in external-facing APIs.
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Serialize, Deserialize, Debug)]
pub struct LocationConfig {
pub mode: LocationConfigMode,
/// If attaching, in what generation?
@@ -341,7 +297,7 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
pub tenant_shard_id: TenantShardId,
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
@@ -715,22 +671,6 @@ pub struct PagestreamDbSizeResponse {
pub db_size: i64,
}
/// Serializable response carrying a single tenant size value.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantPhysicalSizeResponse {
/// Physical size of the tenant (presumably in bytes; TODO confirm with the producing handler).
pub size: u64,
}
// XXX hack: this is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantHistorySize {
/// Tenant identifier reported alongside the size.
pub id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
///
/// Will be none if `?inputs_only=true` was given.
pub size: Option<u64>,
}
impl PagestreamFeMessage {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();

View File

@@ -1,9 +1,6 @@
use std::{ops::RangeInclusive, str::FromStr};
use crate::{
key::{is_rel_block_key, Key},
models::ShardParameters,
};
use crate::key::{is_rel_block_key, Key};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
@@ -84,20 +81,6 @@ impl TenantShardId {
pub fn is_zero(&self) -> bool {
// True for shard number 0; the shard count is not looked at here.
self.shard_number == ShardNumber(0)
}
/// True only when both the shard number and the shard count are zero.
pub fn is_unsharded(&self) -> bool {
    let first_shard = self.shard_number == ShardNumber(0);
    first_shard && self.shard_count == ShardCount(0)
}
/// Drop the tenant ID and keep only the shard number and shard count as a `ShardIndex`.
///
/// Handy for log messages emitted inside a span that already carries the tenant ID,
/// where repeating it would only add noise.
pub fn to_index(&self) -> ShardIndex {
    let (shard_number, shard_count) = (self.shard_number, self.shard_count);
    ShardIndex {
        shard_number,
        shard_count,
    }
}
}
/// Formatting helper
@@ -176,7 +159,7 @@ impl From<[u8; 18]> for TenantShardId {
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
@@ -346,7 +329,7 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
stripe_size: ShardStripeSize,
layout: ShardLayout,
}
@@ -416,17 +399,6 @@ impl ShardIdentity {
}
}
/// Build the `ShardIdentity` for shard `number` of a newly created tenant from the
/// tenant-wide `ShardParameters` given in the creation request.
///
/// The layout is always `LAYOUT_V1`; a missing stripe size falls back to
/// `DEFAULT_STRIPE_SIZE`.
pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
    let stripe_size = match params.stripe_size {
        Some(explicit) => explicit,
        None => DEFAULT_STRIPE_SIZE,
    };
    Self {
        number,
        count: params.count,
        layout: LAYOUT_V1,
        stripe_size,
    }
}
fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}

View File

@@ -16,7 +16,6 @@ use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
profile::ProfileFileCredentialsProvider,
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
@@ -75,29 +74,20 @@ impl S3Bucket {
let region = Some(Region::new(aws_config.bucket_region.clone()));
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
.or_else("token", {
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
.build()
})
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
@@ -228,6 +218,14 @@ impl S3Bucket {
let started_at = ScopeGuard::into_inner(started_at);
if get_object.is_err() {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
}
match get_object {
Ok(object_output) => {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
@@ -243,27 +241,11 @@ impl S3Bucket {
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
Err(DownloadError::NotFound)
}
Err(e) => {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
))
}
Err(e) => Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
)),
}
}
}

View File

@@ -366,49 +366,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
}
}
/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
///
/// This is used by the `pagebench` pageserver benchmarking tool.
pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
impl rand::distributions::uniform::SampleUniform for Lsn {
type Sampler = LsnSampler;
}
impl rand::distributions::uniform::UniformSampler for LsnSampler {
    type X = Lsn;

    /// Build a sampler over the half-open range `[low, high)` by delegating to the
    /// `u64` sampler on the wrapped values.
    fn new<B1, B2>(low: B1, high: B2) -> Self
    where
        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
    {
        let lo: u64 = low.borrow().0;
        let hi: u64 = high.borrow().0;
        let inner = <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(lo, hi);
        Self(inner)
    }

    /// Build a sampler over the closed range `[low, high]` by delegating to the
    /// `u64` sampler on the wrapped values.
    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
    where
        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
    {
        let lo: u64 = low.borrow().0;
        let hi: u64 = high.borrow().0;
        let inner =
            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(lo, hi);
        Self(inner)
    }

    /// Draw a `u64` from the wrapped sampler and wrap it back into an [`Lsn`].
    fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
        let raw: u64 = self.0.sample(rng);
        Lsn(raw)
    }
}
#[cfg(test)]
mod tests {
use crate::bin_ser::BeSer;

View File

@@ -1,12 +1,10 @@
use pageserver_api::{models::*, shard::TenantShardId};
use pageserver_api::models::*;
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
};
pub mod util;
#[derive(Debug)]
pub struct Client {
mgmt_api_endpoint: String,
@@ -68,9 +66,9 @@ impl Client {
pub async fn tenant_details(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
) -> Result<pageserver_api::models::TenantDetails> {
let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
self.get(uri)
.await?
.json()
@@ -80,12 +78,9 @@ impl Client {
pub async fn list_timelines(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline",
self.mgmt_api_endpoint
);
let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
self.get(&uri)
.await?
.json()
@@ -169,17 +164,14 @@ impl Client {
pub async fn location_config(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<std::time::Duration>,
) -> Result<()> {
let req_body = TenantLocationConfigRequest {
tenant_shard_id,
config,
};
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/v1/tenant/{}/location_config",
self.mgmt_api_endpoint, tenant_shard_id
self.mgmt_api_endpoint, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -205,19 +197,4 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_synthetic_size(
&self,
tenant_shard_id: TenantShardId,
) -> Result<TenantHistorySize> {
let uri = format!(
"{}/v1/tenant/{}/synthetic_size",
self.mgmt_api_endpoint, tenant_shard_id
);
self.get(&uri)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -1,53 +0,0 @@
//! Helpers to do common higher-level tasks with the [`Client`].
use std::sync::Arc;
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use utils::id::{TenantId, TenantTimelineId};
use super::Client;
/// Retrieve a list of all of the pageserver's timelines.
///
/// The tenant list is fetched first; the details of each tenant are then requested
/// concurrently, one task per tenant.
///
/// # Errors
///
/// Fails if there are sharded tenants present on the pageserver, if listing the
/// tenants fails, or if fetching the details of any tenant fails. Failures inside
/// the per-tenant tasks are returned to the caller rather than causing a panic.
pub async fn get_pageserver_tenant_timelines_unsharded(
    api_client: &Arc<Client>,
) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut tenants: Vec<TenantId> = Vec::new();
    for ti in api_client.list_tenants().await? {
        if !ti.id.is_unsharded() {
            anyhow::bail!(
                "only unsharded tenants are supported at this time: {}",
                ti.id
            );
        }
        tenants.push(ti.id.tenant_id)
    }

    let mut js = JoinSet::new();
    for tenant_id in tenants {
        let mgmt_api_client = Arc::clone(api_client);
        js.spawn(async move {
            let details = mgmt_api_client
                .tenant_details(TenantShardId::unsharded(tenant_id))
                .await?;
            anyhow::Ok((tenant_id, details))
        });
    }

    let mut timelines: Vec<TenantTimelineId> = Vec::new();
    while let Some(res) = js.join_next().await {
        // First `?`: the task could not be joined (it panicked or was cancelled).
        // Second `?`: the `tenant_details` request itself failed.
        // Returning early drops the JoinSet and with it the remaining tasks.
        let (tenant_id, details) = res??;
        for timeline_id in details.timelines {
            timelines.push(TenantTimelineId {
                tenant_id,
                timeline_id,
            });
        }
    }
    Ok(timelines)
}

View File

@@ -1,26 +0,0 @@
[package]
name = "pagebench"
version = "0.1.0"
edition.workspace = true
license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow.workspace = true
clap.workspace = true
futures.workspace = true
hdrhistogram.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tracing.workspace = true
tokio.workspace = true
pageserver = { path = ".." }
pageserver_client.workspace = true
pageserver_api.workspace = true
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,272 +0,0 @@
use anyhow::Context;
use pageserver_client::page_service::BasebackupRequest;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{debug, info, instrument};
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::ops::Range;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Instant;
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};
/// basebackup@LatestLSN
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "localhost:64000")]
page_service_host_port: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
num_clients: NonZeroUsize,
#[clap(long, default_value = "1.0")]
gzip_probability: f64,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
#[derive(Debug, Default)]
struct LiveStats {
completed_requests: AtomicU64,
}
impl LiveStats {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
}
struct Target {
timeline: TenantTimelineId,
lsn_range: Option<Range<Lsn>>,
}
#[derive(serde::Serialize)]
struct Output {
total: request_stats::Output,
}
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
main_impl(args, thread_local_stats)
})
}
/// Run the basebackup benchmark.
///
/// Steps, in order:
/// 1. discover the target timelines,
/// 2. look up each target's `last_record_lsn` (concurrently, one task per timeline),
/// 3. spawn one `client` task per timeline plus a task that logs the request rate
///    once per second,
/// 4. feed randomly chosen work items to the clients, optionally bounded by
///    `args.runtime`,
/// 5. aggregate the per-thread latency stats and print them to stdout as JSON.
///
/// Without `args.runtime` the work sender never terminates, so the function does
/// not return in that case.
async fn main_impl(
    args: Args,
    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
) -> anyhow::Result<()> {
    // Leak the args so that spawned tasks can hold a `'static` reference to them.
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;

    let mut js = JoinSet::new();
    for timeline in &timelines {
        js.spawn({
            let mgmt_api_client = Arc::clone(&mgmt_api_client);
            let timeline = *timeline;
            async move {
                // The request is issued inside the spawned task so that the lookups
                // for different timelines run concurrently.
                //
                // FIXME: this triggers initial logical size calculation
                // https://github.com/neondatabase/neon/issues/6168
                let info = mgmt_api_client
                    .timeline_info(timeline.tenant_id, timeline.timeline_id)
                    .await
                    .unwrap();
                anyhow::Ok(Target {
                    timeline,
                    // TODO: support lsn_range != latest LSN
                    lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
                })
            }
        });
    }
    let mut all_targets: Vec<Target> = Vec::new();
    while let Some(res) = js.join_next().await {
        all_targets.push(res.unwrap().unwrap());
    }

    let live_stats = Arc::new(LiveStats::default());

    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;

    // Everyone that takes part in the measurement waits on this barrier first.
    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
    ));
    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));

    // Live stats: log and reset the completed-request counter once per second.
    tokio::spawn({
        let stats = Arc::clone(&live_stats);
        let start_work_barrier = Arc::clone(&start_work_barrier);
        async move {
            start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
                info!(
                    "RPS: {:.0}",
                    completed_requests as f64 / elapsed.as_secs_f64()
                );
            }
        }
    });

    // One client task per timeline, each fed through its own channel.
    let mut work_senders = HashMap::new();
    let mut tasks = Vec::new();
    for tl in &timelines {
        let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
        work_senders.insert(tl, sender);
        tasks.push(tokio::spawn(client(
            args,
            *tl,
            Arc::clone(&start_work_barrier),
            receiver,
            Arc::clone(&all_work_done_barrier),
            Arc::clone(&live_stats),
        )));
    }

    let work_sender = async move {
        start_work_barrier.wait().await;
        loop {
            let (timeline, work) = {
                let mut rng = rand::thread_rng();
                let target = all_targets.choose(&mut rng).unwrap();
                let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
                (
                    target.timeline,
                    Work {
                        lsn,
                        gzip: rng.gen_bool(args.gzip_probability),
                    },
                )
            };
            let sender = work_senders.get(&timeline).unwrap();
            // TODO: what if this blocks?
            sender.send(work).await.ok().unwrap();
        }
    };

    if let Some(runtime) = args.runtime {
        match tokio::time::timeout(runtime.into(), work_sender).await {
            Ok(()) => unreachable!("work sender never terminates"),
            Err(_timeout) => {
                // this implicitly drops the work_senders, making all the clients exit
            }
        }
    } else {
        work_sender.await;
        unreachable!("work sender never terminates");
    }

    for t in tasks {
        t.await.unwrap();
    }

    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
            for stats in all_thread_local_stats.lock().unwrap().iter() {
                let stats = stats.lock().unwrap();
                agg_stats.add(&stats);
            }
            agg_stats.output()
        },
    };

    let output = serde_json::to_string_pretty(&output).unwrap();
    println!("{output}");

    anyhow::Ok(())
}
#[derive(Copy, Clone)]
struct Work {
lsn: Option<Lsn>,
gzip: bool,
}
/// Per-timeline worker: waits for the start barrier, connects to the page service,
/// then performs one basebackup per received [`Work`] item, recording the latency of
/// each. Exits once the work channel is closed and then waits on the done barrier.
#[instrument(skip_all)]
async fn client(
    args: &'static Args,
    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
    mut work: tokio::sync::mpsc::Receiver<Work>,
    all_work_done_barrier: Arc<Barrier>,
    live_stats: Arc<LiveStats>,
) {
    use futures::StreamExt;

    start_work_barrier.wait().await;

    let connstring = crate::util::connstring::connstring(
        &args.page_service_host_port,
        args.pageserver_jwt.as_deref(),
    );
    let client = pageserver_client::page_service::Client::new(connstring)
        .await
        .unwrap();

    while let Some(Work { lsn, gzip }) = work.recv().await {
        let started_at = Instant::now();

        let request = BasebackupRequest {
            tenant_id: timeline.tenant_id,
            timeline_id: timeline.timeline_id,
            lsn,
            gzip,
        };
        let copy_out_stream = client
            .basebackup(&request)
            .await
            .with_context(|| format!("start basebackup for {timeline}"))
            .unwrap();

        // Drain the stream, summing up the chunk lengths.
        let total_bytes = Arc::new(AtomicUsize::new(0));
        copy_out_stream
            .for_each(|chunk| {
                let total_bytes = Arc::clone(&total_bytes);
                async move {
                    total_bytes.fetch_add(chunk.unwrap().len(), Ordering::Relaxed);
                }
            })
            .await;
        debug!(
            "basebackup size is {} bytes",
            total_bytes.load(Ordering::Relaxed)
        );

        let elapsed = started_at.elapsed();
        live_stats.inc();
        STATS.with(|stats| {
            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
        });
    }

    all_work_done_barrier.wait().await;
}

View File

@@ -1,335 +0,0 @@
use anyhow::Context;
use futures::future::join_all;
use pageserver::pgdatadir_mapping::key_to_rel_block;
use pageserver::repository;
use pageserver_api::key::is_rel_block_key;
use pageserver_client::page_service::RelTagBlockNo;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
use tracing::{info, instrument};
use std::collections::HashMap;
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(long, default_value = "1")]
num_clients: NonZeroUsize,
#[clap(long)]
runtime: Option<humantime::Duration>,
#[clap(long)]
per_target_rate_limit: Option<usize>,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
#[derive(Debug, Default)]
struct LiveStats {
completed_requests: AtomicU64,
}
impl LiveStats {
fn inc(&self) {
self.completed_requests.fetch_add(1, Ordering::Relaxed);
}
}
#[derive(Clone)]
struct KeyRange {
timeline: TenantTimelineId,
timeline_lsn: Lsn,
start: i128,
end: i128,
}
impl KeyRange {
fn len(&self) -> i128 {
self.end - self.start
}
}
#[derive(serde::Serialize)]
struct Output {
total: request_stats::Output,
}
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
main_impl(args, thread_local_stats)
})
}
/// Run the getpage benchmark.
///
/// Steps, in order:
/// 1. discover the target timelines,
/// 2. fetch each timeline's keyspace and keep only the ranges whose start and end
///    are both rel-block keys,
/// 3. spawn one `client` task per timeline plus a task that logs the request rate
///    once per second,
/// 4. generate random `(page, lsn)` requests, either unthrottled across all ranges
///    or, with `per_target_rate_limit`, on a fixed tick per timeline,
/// 5. aggregate the per-thread latency stats and print them to stdout as JSON.
///
/// Without `args.runtime` the work sender never terminates, so the function does
/// not return in that case.
async fn main_impl(
    args: Args,
    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
) -> anyhow::Result<()> {
    // Leak the args so that spawned tasks can hold a `'static` reference to them.
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: args.limit_to_first_n_targets,
            targets: args.targets.clone(),
        },
    )
    .await?;

    // Fetch the keyspace of every timeline concurrently.
    let mut js = JoinSet::new();
    for timeline in &timelines {
        js.spawn({
            let mgmt_api_client = Arc::clone(&mgmt_api_client);
            let timeline = *timeline;
            async move {
                let partitioning = mgmt_api_client
                    .keyspace(timeline.tenant_id, timeline.timeline_id)
                    .await?;
                let lsn = partitioning.at_lsn;

                let ranges = partitioning
                    .keys
                    .ranges
                    .iter()
                    .filter_map(|r| {
                        let start = r.start;
                        let end = r.end;
                        // filter out non-relblock keys
                        match (is_rel_block_key(&start), is_rel_block_key(&end)) {
                            (true, true) => Some(KeyRange {
                                timeline,
                                timeline_lsn: lsn,
                                start: start.to_i128(),
                                end: end.to_i128(),
                            }),
                            // A range that is only partially made of rel-block keys is
                            // not handled yet and aborts the run.
                            (true, false) | (false, true) => {
                                unimplemented!("split up range")
                            }
                            (false, false) => None,
                        }
                    })
                    .collect::<Vec<_>>();

                anyhow::Ok(ranges)
            }
        });
    }
    let mut all_ranges: Vec<KeyRange> = Vec::new();
    while let Some(res) = js.join_next().await {
        all_ranges.extend(res.unwrap().unwrap());
    }

    let live_stats = Arc::new(LiveStats::default());

    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;

    // Everyone that takes part in the measurement waits on this barrier first.
    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
    ));
    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));

    // Live stats: log and reset the completed-request counter once per second.
    tokio::spawn({
        let stats = Arc::clone(&live_stats);
        let start_work_barrier = Arc::clone(&start_work_barrier);
        async move {
            start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
                info!(
                    "RPS: {:.0}",
                    completed_requests as f64 / elapsed.as_secs_f64()
                );
            }
        }
    });

    // One client task per timeline, each fed through its own channel.
    let mut work_senders = HashMap::new();
    let mut tasks = Vec::new();
    for tl in &timelines {
        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
        work_senders.insert(tl, sender);
        tasks.push(tokio::spawn(client(
            args,
            *tl,
            Arc::clone(&start_work_barrier),
            receiver,
            Arc::clone(&all_work_done_barrier),
            Arc::clone(&live_stats),
        )));
    }

    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
        // Unthrottled: a single loop picks a range (weighted by its length) across all
        // timelines, then a uniformly random key inside that range.
        None => Box::pin(async move {
            let weights = rand::distributions::weighted::WeightedIndex::new(
                all_ranges.iter().map(|v| v.len()),
            )
            .unwrap();

            start_work_barrier.wait().await;

            loop {
                let (range, key) = {
                    let mut rng = rand::thread_rng();
                    let r = &all_ranges[weights.sample(&mut rng)];
                    let key: i128 = rng.gen_range(r.start..r.end);
                    let key = repository::Key::from_i128(key);
                    let (rel_tag, block_no) =
                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                    (r, RelTagBlockNo { rel_tag, block_no })
                };
                let sender = work_senders.get(&range.timeline).unwrap();
                // TODO: what if this blocks?
                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
            }
        }),
        // Rate-limited: one generator future per timeline, each sending one request
        // per tick of an interval with period `1 / rps_limit` seconds.
        Some(rps_limit) => Box::pin(async move {
            let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));

            let make_timeline_task: &dyn Fn(
                TenantTimelineId,
            )
                -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
                let sender = work_senders.get(&timeline).unwrap();
                // Only the ranges that belong to this timeline are sampled from.
                let ranges: Vec<KeyRange> = all_ranges
                    .iter()
                    .filter(|r| r.timeline == timeline)
                    .cloned()
                    .collect();
                let weights = rand::distributions::weighted::WeightedIndex::new(
                    ranges.iter().map(|v| v.len()),
                )
                .unwrap();

                Box::pin(async move {
                    let mut ticker = tokio::time::interval(period);
                    ticker.set_missed_tick_behavior(
                        /* TODO review this choice */
                        tokio::time::MissedTickBehavior::Burst,
                    );
                    loop {
                        ticker.tick().await;
                        let (range, key) = {
                            let mut rng = rand::thread_rng();
                            let r = &ranges[weights.sample(&mut rng)];
                            let key: i128 = rng.gen_range(r.start..r.end);
                            let key = repository::Key::from_i128(key);
                            let (rel_tag, block_no) = key_to_rel_block(key)
                                .expect("we filter non-rel-block keys out above");
                            (r, RelTagBlockNo { rel_tag, block_no })
                        };
                        sender.send((key, range.timeline_lsn)).await.ok().unwrap();
                    }
                })
            };

            let tasks: Vec<_> = work_senders
                .keys()
                .map(|tl| make_timeline_task(**tl))
                .collect();

            start_work_barrier.wait().await;

            join_all(tasks).await;
        }),
    };

    if let Some(runtime) = args.runtime {
        match tokio::time::timeout(runtime.into(), work_sender).await {
            Ok(()) => unreachable!("work sender never terminates"),
            Err(_timeout) => {
                // this implicitly drops the work_senders, making all the clients exit
            }
        }
    } else {
        work_sender.await;
        unreachable!("work sender never terminates");
    }

    for t in tasks {
        t.await.unwrap();
    }

    // Merge the per-thread histograms into one and print the summary as JSON.
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
            for stats in all_thread_local_stats.lock().unwrap().iter() {
                let stats = stats.lock().unwrap();
                agg_stats.add(&stats);
            }
            agg_stats.output()
        },
    };

    let output = serde_json::to_string_pretty(&output).unwrap();
    println!("{output}");

    anyhow::Ok(())
}
/// Per-timeline worker: waits for the start barrier, opens a pagestream for
/// `timeline`, then issues one getpage request per received `(page, lsn)` pair,
/// recording the latency of each. Exits once the work channel is closed and then
/// waits on the done barrier.
#[instrument(skip_all)]
async fn client(
    args: &'static Args,
    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
    all_work_done_barrier: Arc<Barrier>,
    live_stats: Arc<LiveStats>,
) {
    start_work_barrier.wait().await;

    let connstring = args.page_service_connstring.clone();
    let connection = pageserver_client::page_service::Client::new(connstring)
        .await
        .unwrap();
    let mut pagestream = connection
        .pagestream(timeline.tenant_id, timeline.timeline_id)
        .await
        .unwrap();

    while let Some((key, lsn)) = work.recv().await {
        let started_at = Instant::now();
        pagestream
            .getpage(key, lsn)
            .await
            .with_context(|| format!("getpage for {timeline}"))
            .unwrap();
        let elapsed = started_at.elapsed();

        live_stats.inc();
        STATS.with(|stats| {
            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
        });
    }

    all_work_done_barrier.wait().await;
}

View File

@@ -1,85 +0,0 @@
use std::sync::Arc;
use humantime::Duration;
use tokio::task::JoinSet;
use utils::id::TenantTimelineId;
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "localhost:64000")]
page_service_host_port: String,
#[clap(long)]
pageserver_jwt: Option<String>,
#[clap(
long,
help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
)]
poll_for_completion: Option<Duration>,
#[clap(long)]
limit_to_first_n_targets: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let main_task = rt.spawn(main_impl(args));
rt.block_on(main_task).unwrap()
}
/// Request `timeline_info` for every target timeline, one task per timeline.
///
/// With `--poll-for-completion`, each task keeps re-requesting the info at the given
/// period until `current_logical_size_is_accurate` is reported.
async fn main_impl(args: Args) -> anyhow::Result<()> {
    let args: &'static Args = Box::leak(Box::new(args));

    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));

    // discover targets
    let spec = crate::util::cli::targets::Spec {
        limit_to_first_n_targets: args.limit_to_first_n_targets,
        targets: args.targets.clone(),
    };
    let timelines: Vec<TenantTimelineId> =
        crate::util::cli::targets::discover(&mgmt_api_client, spec).await?;

    // kick it off
    let mut js = JoinSet::new();
    for tl in timelines {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
            // TODO: API to explicitly trigger initial logical size computation.
            // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
            // => https://github.com/neondatabase/neon/issues/6168
            let mut info = mgmt_api_client
                .timeline_info(tl.tenant_id, tl.timeline_id)
                .await
                .unwrap();

            // Without a polling period the single request above is all there is to do.
            let period = match args.poll_for_completion {
                Some(period) => period,
                None => return,
            };

            let mut ticker = tokio::time::interval(period.into());
            ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
            while !info.current_logical_size_is_accurate {
                ticker.tick().await;
                info = mgmt_api_client
                    .timeline_info(tl.tenant_id, tl.timeline_id)
                    .await
                    .unwrap();
            }
        });
    }

    // Wait for every task; a panicked task panics here as well.
    while let Some(joined) = js.join_next().await {
        let () = joined.unwrap();
    }
    Ok(())
}

View File

@@ -1,48 +0,0 @@
use clap::Parser;
use utils::logging;
/// Re-usable pieces of code that aren't CLI-specific.
mod util {
pub(crate) mod connstring;
pub(crate) mod request_stats;
#[macro_use]
pub(crate) mod tokio_thread_local_stats;
/// Re-usable pieces of CLI-specific code.
pub(crate) mod cli {
pub(crate) mod targets;
}
}
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod trigger_initial_size_calculation;
}
/// Component-level performance test for pageserver.
#[derive(clap::Parser)]
enum Args {
Basebackup(cmd::basebackup::Args),
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
}
/// Entry point: set up plain-text logging on stderr, parse the command line and
/// run the selected sub-command, panicking if it returns an error.
fn main() {
    logging::init(
        logging::LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
        logging::Output::Stderr,
    )
    .unwrap();

    let outcome = match Args::parse() {
        Args::Basebackup(sub_args) => cmd::basebackup::main(sub_args),
        Args::GetPageLatestLsn(sub_args) => cmd::getpage_latest_lsn::main(sub_args),
        Args::TriggerInitialSizeCalculation(sub_args) => {
            cmd::trigger_initial_size_calculation::main(sub_args)
        }
    };
    outcome.unwrap()
}

View File

@@ -1,34 +0,0 @@
use std::sync::Arc;
use pageserver_client::mgmt_api;
use tracing::info;
use utils::id::TenantTimelineId;
pub(crate) struct Spec {
pub(crate) limit_to_first_n_targets: Option<usize>,
pub(crate) targets: Option<Vec<TenantTimelineId>>,
}
/// Determine the timelines to benchmark.
///
/// Uses `spec.targets` if given; otherwise asks the pageserver for all of its
/// timelines. If `spec.limit_to_first_n_targets` is set, the list is sorted (for
/// determinism) and cut down to that many entries.
///
/// # Errors
///
/// Fails if the pageserver query fails, or if fewer timelines than
/// `limit_to_first_n_targets` are available.
pub(crate) async fn discover(
    api_client: &Arc<mgmt_api::Client>,
    spec: Spec,
) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut timelines = if let Some(targets) = spec.targets {
        targets
    } else {
        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
    };

    if let Some(limit) = spec.limit_to_first_n_targets {
        // Check before truncating so that the real count can be reported.
        if timelines.len() < limit {
            anyhow::bail!(
                "pageserver has less than limit_to_first_n_targets={limit} timelines (found {})",
                timelines.len()
            );
        }
        timelines.sort(); // for determinism
        timelines.truncate(limit);
    }

    info!("timelines:\n{:?}", timelines);
    info!("number of timelines:\n{:?}", timelines.len());

    Ok(timelines)
}

View File

@@ -1,8 +0,0 @@
/// Build a `postgres://` connection URL for user `postgres` at `host_port`.
///
/// If `jwt` is given it is placed in the password position. The token is
/// percent-encoded so that characters with a meaning in URLs (`@`, `/`, `:`, ...)
/// cannot break the URL apart. Tokens that consist only of unreserved characters
/// (letters, digits, `-`, `.`, `_`, `~`) are passed through unchanged.
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
    let colon_and_jwt = match jwt {
        Some(jwt) => format!(":{}", percent_encode_userinfo(jwt)),
        None => String::new(),
    };
    format!("postgres://postgres{colon_and_jwt}@{host_port}")
}

/// Percent-encode every byte of `raw` that is not an RFC 3986 "unreserved" character.
fn percent_encode_userinfo(raw: &str) -> String {
    let mut encoded = String::with_capacity(raw.len());
    for byte in raw.bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => {
                encoded.push(byte as char)
            }
            _ => encoded.push_str(&format!("%{byte:02X}")),
        }
    }
    encoded
}

View File

@@ -1,88 +0,0 @@
use std::time::Duration;
use anyhow::Context;
pub(crate) struct Stats {
latency_histo: hdrhistogram::Histogram<u64>,
}
impl Stats {
    /// Create empty stats backed by a fixed-bounds latency histogram.
    pub(crate) fn new() -> Self {
        // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
        // which would skew the benchmark results.
        let latency_histo = hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap();
        Self { latency_histo }
    }

    /// Record one request latency, in microseconds.
    ///
    /// Fails if the latency does not fit into a `u64` or the histogram rejects the
    /// value.
    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
        let micros = u64::try_from(latency.as_micros()).context("latency greater than u64")?;
        self.latency_histo
            .record(micros)
            .context("add to histogram")
    }

    /// Summarize the recorded latencies: request count, mean, and the percentiles
    /// listed in `LATENCY_PERCENTILES`.
    pub(crate) fn output(&self) -> Output {
        let histo = &self.latency_histo;
        let latency_percentiles = LATENCY_PERCENTILES
            .map(|percentile| Duration::from_micros(histo.value_at_percentile(percentile)));
        let latency_mean = Duration::from_micros(histo.mean() as u64);
        Output {
            request_count: histo.len(),
            latency_mean,
            latency_percentiles: LatencyPercentiles {
                latency_percentiles,
            },
        }
    }

    /// Merge the samples of `other` into `self`.
    pub(crate) fn add(&mut self, other: &Self) {
        // Destructure exhaustively so that a newly added field cannot be forgotten here.
        let Self {
            latency_histo: theirs,
        } = other;
        let Self {
            latency_histo: ours,
        } = self;
        ours.add(theirs).unwrap();
    }
}
impl Default for Stats {
fn default() -> Self {
Self::new()
}
}
const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
struct LatencyPercentiles {
latency_percentiles: [Duration; 4],
}
/// Serializes as a map from `"p<percentile>"` to the human-readable latency at that
/// percentile, in the order of `LATENCY_PERCENTILES`.
impl serde::Serialize for LatencyPercentiles {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeMap;
        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
        // `latency_percentiles[i]` holds the value for `LATENCY_PERCENTILES[i]`, so
        // walk both arrays in lockstep to pair each label with its own value.
        for (p, latency) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
            ser.serialize_entry(
                &format!("p{p}"),
                &format!("{}", &humantime::format_duration(*latency)),
            )?;
        }
        ser.end()
    }
}
#[derive(serde::Serialize)]
pub(crate) struct Output {
request_count: u64,
#[serde(with = "humantime_serde")]
latency_mean: Duration,
latency_percentiles: LatencyPercentiles,
}

View File

@@ -1,45 +0,0 @@
pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
macro_rules! declare {
($THREAD_LOCAL_NAME:ident: $T:ty) => {
thread_local! {
pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
);
}
};
}
use std::sync::{Arc, Mutex};
pub(crate) use declare;
/// Build a multi-threaded tokio runtime whose threads register their
/// `$THREAD_LOCAL_NAME` stats handle in a shared list, then run `$main_impl` on it.
///
/// `$main_impl` is called with that shared list and must return a future; the macro
/// evaluates to the future's output and panics if the spawned main task fails to
/// join.
///
/// The expansion uses the unqualified names `Arc`, `Mutex` and `tokio`, so these
/// presumably need to be in scope at the call site — NOTE(review): confirm.
macro_rules! main {
    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
        let main_impl = $main_impl;
        // Collects one stats handle per runtime thread.
        let all = Arc::new(Mutex::new(Vec::new()));
        let rt = tokio::runtime::Builder::new_multi_thread()
            .on_thread_start({
                let all = Arc::clone(&all);
                move || {
                    // pre-initialize the thread local stats by accessing them
                    // (some stats like request_stats::Stats are quite costly to initialize,
                    //  we don't want to pay that cost during the measurement period)
                    $THREAD_LOCAL_NAME.with(|stats| {
                        let stats: Arc<_> = Arc::clone(&*stats.borrow());
                        all.lock().unwrap().push(stats);
                    });
                }
            })
            .enable_all()
            .build()
            .unwrap();
        // Run the main future as a spawned task and wait for it to finish.
        let main_task = rt.spawn(main_impl(all));
        rt.block_on(main_task).unwrap()
    }};
}
pub(crate) use main;

View File

@@ -1468,7 +1468,6 @@ threshold = "20m"
period: Duration::from_secs(10),
#[cfg(feature = "testing")]
mock_statvfs: None,
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
})
);
match &conf.default_tenant_conf.eviction_policy {

View File

@@ -74,45 +74,6 @@ pub struct DiskUsageEvictionTaskConfig {
pub period: Duration,
#[cfg(feature = "testing")]
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
/// Select sorting for evicted layers
#[serde(default)]
pub eviction_order: EvictionOrder,
}
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
/// partitioning.
///
/// Serialized with a `type` tag and the variant's fields under `args`.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
    /// Order the layers to be evicted by how recently they have been accessed in absolute
    /// time.
    ///
    /// This strategy is unfair towards slower-growing tenants when some tenants grow
    /// faster than others.
    ///
    /// This is the default order.
    #[default]
    AbsoluteAccessed,

    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
    ///
    /// This strategy will evict layers more fairly but is untested.
    RelativeAccessed {
        /// Defaults to `false` when omitted from the serialized form.
        #[serde(default)]
        highest_layer_count_loses_first: bool,
    },
}
impl EvictionOrder {
    /// Whether the tenants with the highest layer counts should have their layers
    /// evicted first.
    ///
    /// Only [`Self::RelativeAccessed`] carries this flag; for
    /// [`Self::AbsoluteAccessed`] the answer is always `false`.
    fn highest_layer_count_loses_first(&self) -> bool {
        match *self {
            EvictionOrder::RelativeAccessed {
                highest_layer_count_loses_first: loses_first,
            } => loses_first,
            EvictionOrder::AbsoluteAccessed => false,
        }
    }
}
#[derive(Default)]
@@ -231,14 +192,7 @@ async fn disk_usage_eviction_task_iteration(
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(
state,
storage,
usage_pre,
task_config.eviction_order,
cancel,
)
.await;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -324,7 +278,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -344,7 +297,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
let candidates = match collect_eviction_candidates(cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
@@ -354,16 +307,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Debug-log the list of candidates
let now = SystemTime::now();
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let nth = i + 1;
let desc = candidate.layer.layer_desc();
let total_candidates = candidates.len();
let size = desc.file_size;
let rel = candidate.relative_last_activity;
debug!(
"cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
i + 1,
candidates.len(),
desc.file_size,
now.duration_since(candidate.last_activity_ts)
.unwrap()
.as_micros(),
partition,
desc.tenant_shard_id,
desc.timeline_id,
candidate.layer,
@@ -506,7 +459,6 @@ struct EvictionCandidate {
timeline: Arc<Timeline>,
layer: Layer,
last_activity_ts: SystemTime,
relative_last_activity: finite_f32::FiniteF32,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -526,24 +478,24 @@ enum EvictionCandidates {
/// order. A caller that evicts in that order, until pressure is relieved, implements
/// the eviction policy outlined in the module comment.
///
/// # Example with EvictionOrder::AbsoluteAccessed
/// # Example
///
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
/// Each layer has size 100, and both tenants' min_resident_size is 150.
/// The eviction order would be
///
/// ```text
/// partition last_activity_ts tenant/layer
/// Above 18:30 A/c
/// Above 19:00 A/b
/// Above 18:29 B/c
/// Above 19:05 B/b
/// Above 20:00 B/a
/// Above 20:03 A/a
/// Below 20:30 A/d
/// Below 20:40 B/d
/// Below 20:45 B/e
/// Below 20:58 A/e
/// partition last_activity_ts tenant/layer
/// Above 18:30 A/c
/// Above 19:00 A/b
/// Above 18:29 B/c
/// Above 19:05 B/b
/// Above 20:00 B/a
/// Above 20:03 A/a
/// Below 20:30 A/d
/// Below 20:40 B/d
/// Below 20:45 B/e
/// Below 20:58 A/e
/// ```
///
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -553,77 +505,7 @@ enum EvictionCandidates {
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
/// after exhausting the `Above` partition.
/// So, we did not respect each tenant's min_resident_size.
///
/// # Example with EvictionOrder::RelativeAccessed
///
/// ```text
/// partition relative_age last_activity_ts tenant/layer
/// Above 0/4 18:30 A/c
/// Above 0/4 18:29 B/c
/// Above 1/4 19:00 A/b
/// Above 1/4 19:05 B/b
/// Above 2/4 20:00 B/a
/// Above 2/4 20:03 A/a
/// Below 3/4 20:30 A/d
/// Below 3/4 20:40 B/d
/// Below 4/4 20:45 B/e
/// Below 4/4 20:58 A/e
/// ```
///
/// With tenants having the same number of layers the picture does not change much. The same with
/// A having many more layers **resident** (not all of them listed):
///
/// ```text
/// Above 0/100 18:30 A/c
/// Above 0/4 18:29 B/c
/// Above 1/100 19:00 A/b
/// Above 2/100 20:03 A/a
/// Above 3/100 20:03 A/nth_3
/// Above 4/100 20:03 A/nth_4
/// ...
/// Above 1/4 19:05 B/b
/// Above 25/100 20:04 A/nth_25
/// ...
/// Above 2/4 20:00 B/a
/// Above 50/100 20:10 A/nth_50
/// ...
/// Below 3/4 20:40 B/d
/// Below 99/100 20:30 A/nth_99
/// Below 4/4 20:45 B/e
/// Below 100/100 20:58 A/nth_100
/// ```
///
/// Now it's easier to see that because A has grown fast, it has more layers to get evicted. What is
/// difficult to see is what happens on the next round, assuming that evicting 23 layers from the
/// above list relieves the pressure (22 A layers gone, 1 B layer gone) but a new fast-growing
/// tenant C has appeared:
///
/// ```text
/// Above 0/87 20:04 A/nth_23
/// Above 0/3 19:05 B/b
/// Above 0/50 20:59 C/nth_0
/// Above 1/87 20:04 A/nth_24
/// Above 1/50 21:00 C/nth_1
/// Above 2/87 20:04 A/nth_25
/// ...
/// Above 16/50 21:02 C/nth_16
/// Above 1/3 20:00 B/a
/// Above 27/87 20:10 A/nth_50
/// ...
/// Below 2/3 20:40 B/d
/// Below 49/50 21:05 C/nth_49
/// Below 86/87 20:30 A/nth_99
/// Below 3/3 20:45 B/e
/// Below 50/50 21:05 C/nth_50
/// Below 87/87 20:58 A/nth_100
/// ```
///
/// Now relieving pressure with 23 layers would cost:
/// - tenant A 14 layers
/// - tenant B 1 layer
/// - tenant C 8 layers
async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
// get a snapshot of the list of tenants
@@ -709,63 +591,12 @@ async fn collect_eviction_candidates(
tenant_candidates
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if eviction_order.highest_layer_count_loses_first() {
// relative_age vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic this could hit the same tenants
// disproportionately on multiple invocations. alternative could be to remember how many
// layers did we evict last time from this tenant, and inject that as an additional
// fudge here.
1
};
let total = tenant_candidates
.len()
.checked_sub(fudge)
.filter(|&x| x > 0)
// support 0 or 1 resident layer tenants as well
.unwrap_or(1);
let divider = total as f32;
for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
for (timeline, layer_info) in tenant_candidates.into_iter() {
let file_size = layer_info.file_size();
// as we iterate this reverse sorted list, the most recently accessed layer will always
// be 1.0; this is for us to evict it last.
let relative_last_activity = if matches!(
eviction_order,
EvictionOrder::RelativeAccessed { .. }
) {
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
// similarly for u16. unsure how it would help.
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
} else {
finite_f32::FiniteF32::ZERO
};
let candidate = EvictionCandidate {
timeline,
last_activity_ts: layer_info.last_activity_ts,
layer: layer_info.layer,
relative_last_activity,
};
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
@@ -779,19 +610,8 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
match eviction_order {
EvictionOrder::AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
EvictionOrder::RelativeAccessed { .. } => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
}
}
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
Ok(EvictionCandidates::Finished(candidates))
}
@@ -820,66 +640,6 @@ impl std::ops::Deref for TimelineKey {
}
}
/// A totally ordered f32 subset we can use with sorting functions.
mod finite_f32 {
    /// Wrapper around an `f32` that is guaranteed to be finite and never negative zero.
    ///
    /// Because NaN, the infinities and `-0.0` are excluded at construction time, the derived
    /// `PartialEq` (IEEE comparison) and the `Ord` implementation (`f32::total_cmp`) agree on
    /// every representable value, which is what `Eq` + `Ord` require for use as a sort key.
    #[derive(Clone, Copy, PartialEq)]
    pub struct FiniteF32(f32);

    impl std::fmt::Debug for FiniteF32 {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            std::fmt::Debug::fmt(&self.0, f)
        }
    }

    impl std::fmt::Display for FiniteF32 {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            std::fmt::Display::fmt(&self.0, f)
        }
    }

    impl std::cmp::Eq for FiniteF32 {}

    impl std::cmp::PartialOrd for FiniteF32 {
        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
            Some(self.cmp(other))
        }
    }

    impl std::cmp::Ord for FiniteF32 {
        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
            self.0.total_cmp(&other.0)
        }
    }

    impl TryFrom<f32> for FiniteF32 {
        type Error = f32;

        /// Accepts any finite value; NaN and the infinities are handed back as the error.
        fn try_from(value: f32) -> Result<Self, Self::Error> {
            if value.is_finite() {
                // `total_cmp` orders -0.0 strictly before 0.0 while `==` treats them as equal.
                // Store positive zero for both so that `Eq` and `Ord` can never disagree.
                let value = if value == 0.0 { 0.0 } else { value };
                Ok(FiniteF32(value))
            } else {
                Err(value)
            }
        }
    }

    impl FiniteF32 {
        pub const ZERO: FiniteF32 = FiniteF32(0.0);

        /// Accepts only values within `0.0..=1.0`; anything else (including NaN) is handed
        /// back as the error.
        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
            if (0.0..=1.0).contains(&value) {
                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
                let value = value.abs();
                Ok(FiniteF32(value))
            } else {
                Err(value)
            }
        }
    }
}
mod filesystem_level_usage {
use anyhow::Context;
use camino::Utf8Path;
@@ -961,7 +721,6 @@ mod filesystem_level_usage {
#[test]
fn max_usage_pct_pressure() {
use super::EvictionOrder;
use super::Usage as _;
use std::time::Duration;
use utils::serde_percent::Percent;
@@ -973,7 +732,6 @@ mod filesystem_level_usage {
period: Duration::MAX,
#[cfg(feature = "testing")]
mock_statvfs: None,
eviction_order: EvictionOrder::default(),
},
total_bytes: 100_000,
avail_bytes: 0,

View File

@@ -15,13 +15,10 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLoadRequest, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tenant_size_model::{SizeResult, StorageModel};
@@ -262,7 +259,7 @@ impl From<SetNewTenantConfigError> for ApiError {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
e @ SetNewTenantConfigError::Persist(_) => {
ApiError::InternalServerError(anyhow::Error::new(e))
}
}
@@ -989,25 +986,6 @@ async fn tenant_size_handler(
)
}
/// HTTP handler that splits one tenant shard into a larger number of shards.
///
/// Reads a [`TenantShardSplitRequest`] from the JSON body and the parent shard from the
/// `tenant_shard_id` path parameter, delegates to `TenantManager::shard_split`, and answers
/// with the resulting child shard ids. Any failure of the split itself is reported as an
/// internal server error.
async fn tenant_shard_split_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    let split_request: TenantShardSplitRequest = json_request(&mut request).await?;
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let target_count = ShardCount(split_request.new_shard_count);

    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let split_result = state
        .tenant_manager
        .shard_split(tenant_shard_id, target_count, &ctx)
        .await;

    match split_result {
        Ok(new_shards) => json_response(StatusCode::OK, TenantShardSplitResponse { new_shards }),
        Err(err) => Err(ApiError::InternalServerError(err)),
    }
}
async fn layer_map_info_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -1172,7 +1150,6 @@ async fn tenant_create_handler(
state.conf,
tenant_conf,
target_tenant_id,
request_data.shard_parameters,
generation,
state.tenant_resources(),
&ctx,
@@ -1589,22 +1566,19 @@ async fn disk_usage_eviction_run(
struct Config {
/// How many bytes to evict before reporting that pressure is relieved.
evict_bytes: u64,
#[serde(default)]
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
}
#[derive(Debug, Clone, Copy, serde::Serialize)]
struct Usage {
// remains unchanged after instantiation of the struct
evict_bytes: u64,
config: Config,
// updated by `add_available_bytes`
freed_bytes: u64,
}
impl crate::disk_usage_eviction_task::Usage for Usage {
fn has_pressure(&self) -> bool {
self.evict_bytes > self.freed_bytes
self.config.evict_bytes > self.freed_bytes
}
fn add_available_bytes(&mut self, bytes: u64) {
@@ -1615,7 +1589,7 @@ async fn disk_usage_eviction_run(
let config = json_request::<Config>(&mut r).await?;
let usage = Usage {
evict_bytes: config.evict_bytes,
config,
freed_bytes: 0,
};
@@ -1630,11 +1604,7 @@ async fn disk_usage_eviction_run(
let state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
storage,
usage,
config.eviction_order,
&cancel,
&state, storage, usage, &cancel,
)
.await;
@@ -1846,9 +1816,6 @@ pub fn make_router(
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
})
.put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
api_handler(r, tenant_shard_split_handler)
})
.get("/v1/tenant/:tenant_shard_id/config", |r| {
api_handler(r, get_tenant_config_handler)
})

View File

@@ -405,20 +405,13 @@ impl PageServerHandler {
// shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
// that we look up here may not be the one that serves all the actual requests: we will double
// check the mapping of key->shard later before calling into Timeline for getpage requests.
let tenant = match mgr::get_active_tenant_with_timeout(
let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
ShardSelector::First,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await
{
Ok(t) => t,
Err(e) => {
tracing::warn!("Error at start of handle_pagerequests: {}", e);
return Err(e.into());
}
};
.await?;
// Make request tracer if needed
let mut tracer = if tenant.get_trace_read_requests() {
@@ -433,18 +426,9 @@ impl PageServerHandler {
};
// Check that the timeline exists
let timeline = match tenant.get_timeline(timeline_id, true) {
Ok(t) => t,
Err(e) => {
tracing::warn!("Error getting timeline: {}", e);
return Err(QueryError::Other(anyhow::anyhow!(e)));
}
};
tracing::info!(
"handle_pagerequests: got timeline {}",
timeline.tenant_shard_id
);
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
@@ -831,10 +815,6 @@ impl PageServerHandler {
let key = rel_block_to_key(req.rel, req.blkno);
let page = if timeline.get_shard_identity().is_key_local(&key) {
tracing::debug!(
"handle_get_page_at_lsn: using shard {}",
timeline.tenant_shard_id
);
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
@@ -864,18 +844,11 @@ impl PageServerHandler {
// informed yet.
//
// https://github.com/neondatabase/neon/issues/6038
tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard{}, key {}",
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
return Err(anyhow::anyhow!("Request routed to wrong shard"));
}
Err(e) => return Err(e.into()),
};
tracing::debug!(
"handle_get_page_at_lsn: using shard {}",
timeline.tenant_shard_id
);
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;

View File

@@ -1776,7 +1776,6 @@ pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
@@ -1791,6 +1790,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
/// Returns `true` when `key` has kind byte `0x00`, a non-zero `field4`, the FSM fork number in
/// `field5`, and a `field6` other than `0xffffffff`.
pub fn is_rel_fsm_block_key(key: Key) -> bool {
    if key.field1 != 0x00 {
        return false;
    }
    if key.field4 == 0 {
        return false;
    }
    if key.field5 != FSM_FORKNUM {
        return false;
    }
    // NOTE(review): 0xffffffff presumably marks a non-block (metadata) entry; confirm against
    // the key layout definitions.
    key.field6 != 0xffffffff
}

View File

@@ -17,7 +17,6 @@ use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TimelineState;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
@@ -50,7 +49,6 @@ use self::metadata::TimelineMetadata;
use self::mgr::GetActiveTenantError;
use self::mgr::GetTenantError;
use self::mgr::TenantsMap;
use self::remote_timeline_client::upload::upload_index_part;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineExclusionError;
use self::timeline::uninit::TimelineUninitMark;
@@ -1533,7 +1531,6 @@ impl Tenant {
})?;
if active_only && !timeline.is_active() {
tracing::warn!("Timeline {} is not active", timeline.timeline_id);
Err(GetTimelineError::NotActive {
tenant_id: self.tenant_shard_id.tenant_id,
timeline_id,
@@ -2307,66 +2304,6 @@ impl Tenant {
pub(crate) fn get_generation(&self) -> Generation {
self.generation
}
/// Prepare remote storage for splitting this tenant shard into `child_shards`.
///
/// For every timeline of this tenant, in order:
/// 1. schedule an index upload on the parent's remote client and wait for it to complete,
/// 2. shut down the parent's remote client,
/// 3. download the parent's index again,
/// 4. upload a copy of that index for each child shard, using this tenant's generation.
///
/// # Errors
///
/// Fails if a timeline has no remote client or the tenant has no remote storage, if the
/// downloaded index reports the timeline as deleted, or if any upload, wait, shutdown or
/// download step returns an error. Timelines processed before the failure are not rolled back.
pub(crate) async fn split_prepare(
&self,
child_shards: &Vec<TenantShardId>,
) -> anyhow::Result<()> {
// Work on a clone of the timelines map: the lock guard is a temporary that is released at
// the end of this statement, so it is not held across the awaits below.
let timelines = self.timelines.lock().unwrap().clone();
for timeline in timelines.values() {
let Some(tl_client) = &timeline.remote_client else {
anyhow::bail!("Remote storage is mandatory");
};
let Some(remote_storage) = &self.remote_storage else {
anyhow::bail!("Remote storage is mandatory");
};
// TODO: some higher level should enforce that timeline creation/deletion does not
// happen concurrently with splits. This is impossible to safely coordinate locally
// within one single pageserver's view of the world.
// Upload an index from the parent: this is partly to provide freshness for the
// child tenants that will copy it, and partly for general ease-of-debugging: there will
// always be a parent shard index in the same generation as we wrote the child shard index.
tl_client.schedule_index_upload_for_file_changes()?;
tl_client.wait_completion().await?;
// Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers.
tl_client.shutdown().await?;
// Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue.
// TODO: create a way for remote timeline client to give us a copy of the last IndexPart it uploaded
// without having to download it again.
// TODO: carry a cancellation token in here
let result = tl_client
.download_index_file(CancellationToken::new())
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
.await?;
// A deleted marker here means the timeline was removed while we were splitting; abort.
let index_part = match result {
MaybeDeletedIndexPart::Deleted(_) => {
anyhow::bail!("Timeline deletion happened concurrently with split")
}
MaybeDeletedIndexPart::IndexPart(p) => p,
};
// Write the parent's index under each child shard's id, in the parent's generation.
for child_shard in child_shards {
upload_index_part(
remote_storage,
child_shard,
&timeline.timeline_id,
self.generation,
&index_part,
&self.cancel,
)
.await?;
}
}
Ok(())
}
}
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2710,11 +2647,10 @@ impl Tenant {
}
}
// Legacy configs are implicitly in attached state, and do not support sharding
// Legacy configs are implicitly in attached state
Ok(LocationConf::attached_single(
tenant_conf,
Generation::none(),
&ShardParameters::default(),
))
} else {
// FIXME If the config file is not found, assume that we're attaching
@@ -3198,7 +3134,6 @@ impl Tenant {
/// For unit tests, make this visible so that other modules can directly create timelines
#[cfg(test)]
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn bootstrap_timeline_test(
&self,
timeline_id: TimelineId,
@@ -3289,45 +3224,43 @@ impl Tenant {
// Upload the created data dir to S3
if let Some(storage) = &self.remote_storage {
if self.tenant_shard_id().is_zero() {
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.tenant_shard_id.tenant_id,
&timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
},
|_| false,
3,
u32::MAX,
"persist_initdb_tar_zst",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await?;
tokio::fs::remove_file(&temp_path)
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.tenant_shard_id.tenant_id,
&timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {temp_path}"))?;
}
},
|_| false,
3,
u32::MAX,
"persist_initdb_tar_zst",
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await?;
tokio::fs::remove_file(&temp_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {temp_path}"))?;
}
}
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
@@ -3685,10 +3618,6 @@ impl Tenant {
Ok(())
}
/// Returns a copy of the tenant-level configuration overrides currently stored for this
/// tenant.
///
/// Panics if the configuration lock is poisoned.
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
    let conf_guard = self.tenant_conf.read().unwrap();
    conf_guard.tenant_conf
}
}
fn remove_timeline_and_uninit_mark(
@@ -4130,7 +4059,6 @@ pub(crate) mod harness {
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt::from(self.tenant_conf),
self.generation,
&ShardParameters::default(),
))
.unwrap(),
// This is a legacy/test code path: sharding isn't supported here.

View File

@@ -9,7 +9,7 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models::{self, ShardParameters};
use pageserver_api::models;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
@@ -24,7 +24,7 @@ pub mod defaults {
// which is good for now to trigger bugs.
// This parameter actually determines L0 layer file size.
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 s";
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
@@ -165,17 +165,14 @@ impl LocationConf {
/// For use when loading from a legacy configuration: presence of a tenant
/// implies it is in AttachmentMode::Single, which used to be the only
/// possible state. This function should eventually be removed.
pub(crate) fn attached_single(
tenant_conf: TenantConfOpt,
generation: Generation,
shard_params: &ShardParameters,
) -> Self {
pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
Self {
mode: LocationMode::Attached(AttachedLocationConfig {
generation,
attach_mode: AttachmentMode::Single,
}),
shard: ShardIdentity::from_params(ShardNumber(0), shard_params),
// Legacy configuration loads are always from tenants created before sharding existed.
shard: ShardIdentity::unsharded(),
tenant_conf,
}
}

View File

@@ -2,10 +2,8 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
@@ -22,7 +20,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use remote_storage::GenericRemoteStorage;
use utils::{completion, crashsafe};
use utils::crashsafe;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -132,7 +130,7 @@ impl TenantsMap {
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
fn resolve_attached_shard(
fn resolve_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
@@ -142,27 +140,25 @@ impl TenantsMap {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
_ => continue,
};
match selector {
ShardSelector::First => return Some(*slot.0),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return Some(*slot.0)
}
ShardSelector::Page(key) => {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if let Some(tenant) = slot.1.get_attached() {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
} else {
continue;
}
}
_ => continue,
@@ -518,7 +514,10 @@ pub async fn init_tenant_mgr(
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
tenants.insert(
TenantShardId::unsharded(tenant.tenant_id()),
TenantSlot::Attached(tenant),
);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
@@ -620,6 +619,8 @@ pub(crate) async fn shutdown_all_tenants() {
}
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
use utils::completion;
let mut join_set = JoinSet::new();
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
@@ -745,21 +746,13 @@ pub(crate) async fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_shard_id: TenantShardId,
shard_params: ShardParameters,
generation: Generation,
resources: TenantSharedResources,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
let location_conf = LocationConf::attached_single(tenant_conf, generation);
info!("Creating tenant at location {location_conf:?}");
if shard_params.count != ShardCount(1) {
return Err(TenantMapInsertError::Other(anyhow::anyhow!(
"Only single-shard tenant creations may be serviced directly by a pageserver"
)));
}
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
@@ -794,8 +787,6 @@ pub(crate) enum SetNewTenantConfigError {
GetTenant(#[from] GetTenantError),
#[error(transparent)]
Persist(anyhow::Error),
#[error(transparent)]
Other(anyhow::Error),
}
pub(crate) async fn set_new_tenant_config(
@@ -809,21 +800,10 @@ pub(crate) async fn set_new_tenant_config(
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
if tenant.tenant_shard_id().shard_count > ShardCount(0) {
// Note that we use ShardParameters::default below.
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
)));
}
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(
new_tenant_conf,
tenant.generation,
&ShardParameters::default(),
);
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
@@ -982,27 +962,35 @@ impl TenantManager {
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
// Directory structure is the same for attached and secondary modes:
// create it if it doesn't exist. Timeline load/creation expects the
// timelines/ subdir to already exist.
//
// Does not need to be fsync'd because local storage is just a cache.
tokio::fs::create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
// Before activating either secondary or attached mode, persist the
// configuration, so that on restart we will re-attach (or re-start
// secondary) on the tenant.
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => TenantSlot::Secondary,
LocationMode::Secondary(_) => {
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
tokio::fs::create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
}
LocationMode::Attached(_attach_config) => {
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
tokio::fs::create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let shard_identity = new_location_config.shard;
let tenant = tenant_spawn(
self.conf,
@@ -1114,112 +1102,6 @@ impl TenantManager {
.collect(),
}
}
/// Split the attached shard `tenant_shard_id` into child shards so that the tenant ends up
/// with `new_shard_count` shards in total.
///
/// The work happens in four phases:
/// 1. write the child shards' remote index files in the parent's current generation,
/// 2. take the parent's slot in the tenant map and shut the parent down,
/// 3. spawn each child shard as an attached location that inherits the parent's config and
///    generation,
/// 4. release the parent's slot.
///
/// Returns the ids of the child shards that were created.
///
/// # Errors
///
/// Fails if the parent tenant cannot be found, if `new_shard_count` is not larger than the
/// current count, is not an exact multiple of it, or the multiplier is not a power of two,
/// if the parent was detached while the split was in progress, or if any phase returns an
/// error. Work done by earlier phases is not rolled back.
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
pub(crate) async fn shard_split(
    &self,
    tenant_shard_id: TenantShardId,
    new_shard_count: ShardCount,
    ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
    let tenant = get_tenant(tenant_shard_id, true)?;

    // Plan: identify what the new child shards will be
    // A stored shard count of 0 is treated as a single shard.
    let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
    if new_shard_count <= ShardCount(effective_old_shard_count) {
        anyhow::bail!("Requested shard count is not an increase");
    }
    // The child selection below (`shard_number % old_count == parent_number`) only yields the
    // shards that the parent's keys map to when the new count is an exact multiple of the old
    // one. Without this check, the truncating division below would let e.g. 2 -> 5 through.
    if new_shard_count.0 % effective_old_shard_count != 0 {
        anyhow::bail!("Requested shard count is not a multiple of the current shard count");
    }
    let expansion_factor = new_shard_count.0 / effective_old_shard_count;
    if expansion_factor & (expansion_factor - 1) != 0 {
        anyhow::bail!("Requested split is not a power of two");
    }

    // Key mapping is based on a round robin mapping of key hash modulo shard count,
    // so our child shards are the ones which the same keys would map to.
    let mut child_shards = Vec::new();
    for shard_number in 0..ShardNumber(new_shard_count.0).0 {
        if shard_number % effective_old_shard_count == tenant_shard_id.shard_number.0 {
            child_shards.push(TenantShardId {
                tenant_id: tenant_shard_id.tenant_id,
                shard_number: ShardNumber(shard_number),
                shard_count: new_shard_count,
            })
        }
    }

    // Capture everything the children inherit before the parent handle is dropped.
    let parent_shard_identity = tenant.shard_identity;
    let parent_tenant_conf = tenant.get_tenant_conf();
    let parent_generation = tenant.generation;

    // TODO: write a unit test for this
    tracing::info!(
        "Shard {} splits into: {}",
        tenant_shard_id.to_index(),
        child_shards
            .iter()
            .map(|id| format!("{}", id.to_index()))
            .join(",")
    );

    // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
    tenant.split_prepare(&child_shards).await?;
    self.resources.deletion_queue_client.flush_advisory();

    // Phase 2: Put the parent shard to InProgress and shut it down
    drop(tenant);
    let mut parent_slot_guard =
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
    match parent_slot_guard.get_old_value() {
        Some(TenantSlot::Attached(t)) => {
            let (_guard, progress) = completion::channel();
            match t.shutdown(progress, false).await {
                Ok(()) => {}
                Err(other) => {
                    // Shutdown handed back a handle instead of succeeding: wait on it.
                    other.wait().await;
                }
            }
        }
        Some(TenantSlot::Secondary) => {}
        Some(TenantSlot::InProgress(_)) => {
            unreachable!()
        }
        None => {
            // We don't actually need the parent shard to still be attached to do our work, but it's
            // a weird enough situation that the caller probably didn't want us to continue working
            // if they had detached the tenant they requested the split on.
            anyhow::bail!("Detached parent shard in the middle of split!")
        }
    };
    parent_slot_guard.drop_old_value()?;

    // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
    // TODO: erase the dentries from the parent

    // Phase 3: Spawn the child shards
    for child_shard in &child_shards {
        let mut child_shard_identity = parent_shard_identity;
        child_shard_identity.count = child_shard.shard_count;
        child_shard_identity.number = child_shard.shard_number;
        let child_location_conf = LocationConf {
            mode: LocationMode::Attached(AttachedLocationConfig {
                generation: parent_generation,
                attach_mode: AttachmentMode::Single,
            }),
            shard: child_shard_identity,
            tenant_conf: parent_tenant_conf,
        };
        self.upsert_location(*child_shard, child_location_conf, None, ctx)
            .await?;
    }

    // Phase 4: Release the InProgress on the parent shard
    drop(parent_slot_guard);

    Ok(child_shards)
}
}
#[derive(Debug, thiserror::Error)]
@@ -1321,11 +1203,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
let locked = TENANTS.read().unwrap();
// Resolve TenantId to TenantShardId
let tenant_shard_id = locked
.resolve_attached_shard(&tenant_id, shard_selector)
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)))?;
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
)?;
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
@@ -1661,11 +1541,10 @@ pub(crate) async fn attach_tenant(
) -> Result<(), TenantMapInsertError> {
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let shard_params = ShardParameters::default();
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -2108,6 +1987,8 @@ async fn remove_tenant_from_memory<V, F>(
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
use utils::completion;
let mut slot_guard =
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

View File

@@ -182,7 +182,7 @@
pub(crate) mod download;
pub mod index;
pub(crate) mod upload;
mod upload;
use anyhow::Context;
use camino::Utf8Path;
@@ -690,10 +690,7 @@ impl RemoteTimelineClient {
.insert(layer.layer_desc().filename(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!(
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);
info!("scheduled layer file upload {layer}");
let op = UploadOp::UploadLayer(layer, metadata);
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -821,25 +818,8 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
) {
// Filter out any layers which were not created by this tenant shard. These are
// layers that originate from some ancestor shard after a split, and may still
// be referenced by other shards. We are free to delete them locally and remove
// them from our index (and would have already done so when we reach this point
// in the code), but we may not delete them remotely.
with_metadata.retain(|(name, meta)| {
let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
&& meta.shard.shard_count == self.tenant_shard_id.shard_count;
if !retain {
tracing::debug!(
"Skipping deletion of ancestor-shard layer {name}, from shard {}",
meta.shard
);
}
retain
});
for (name, meta) in &with_metadata {
info!(
"scheduling deletion of layer {}{} (shard {})",
@@ -2212,6 +2192,15 @@ mod tests {
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
timeline_path
.strip_prefix(&test_state.harness.conf.workdir)
.unwrap(),
);
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(
&test_state.harness.tenant_shard_id,
@@ -2220,10 +2209,6 @@ mod tests {
)
.get_path(),
);
std::fs::create_dir_all(index_path.parent().unwrap())
.expect("creating test dir should work");
eprintln!("Writing {index_path}");
std::fs::write(&index_path, index_part_bytes).unwrap();
example_index_part

View File

@@ -25,7 +25,7 @@ use super::index::LayerFileMetadata;
use tracing::info;
/// Serializes and uploads the given index part data to the remote storage.
pub(crate) async fn upload_index_part<'a>(
pub(super) async fn upload_index_part<'a>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,

View File

@@ -878,23 +878,6 @@ impl LayerInner {
Ok(())
}
Err(e) => {
let consecutive_failures =
this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
let backoff = utils::backoff::exponential_backoff_duration_seconds(
consecutive_failures.min(u32::MAX as usize) as u32,
1.5,
60.0,
);
let backoff = std::time::Duration::from_secs_f64(backoff);
tokio::select! {
_ = tokio::time::sleep(backoff) => {},
_ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
_ = timeline.cancel.cancelled() => {},
};
Err(e)
}
};
@@ -943,9 +926,21 @@ impl LayerInner {
Ok(permit)
}
Ok((Err(e), _permit)) => {
// sleep already happened in the spawned task, if it was not cancelled
let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
// FIXME: this should be with the spawned task and be cancellation sensitive
//
// while we should not need this, this backoff has turned out to be useful with
// a bug of unexpectedly deleted remote layer file (#5787).
let consecutive_failures =
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
let backoff = utils::backoff::exponential_backoff_duration_seconds(
consecutive_failures.min(u32::MAX as usize) as u32,
1.5,
60.0,
);
let backoff = std::time::Duration::from_secs_f64(backoff);
tokio::time::sleep(backoff).await;
Err(DownloadError::DownloadFailed)
}
Err(_gone) => Err(DownloadError::DownloadCancelled),

View File

@@ -903,15 +903,10 @@ impl Timeline {
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
tracing::info!("activate 1");
self.spawn_initial_logical_size_computation_task(ctx);
tracing::info!("activate 2");
self.launch_wal_receiver(ctx, broker_client);
tracing::info!("activate 3");
self.set_state(TimelineState::Active);
tracing::info!("activate 4");
self.launch_eviction_task(background_jobs_can_start);
tracing::info!("activate 5");
}
/// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then

View File

@@ -1612,7 +1612,6 @@ impl<'a> WalIngest<'a> {
mod tests {
use super::*;
use crate::tenant::harness::*;
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
use postgres_ffi::RELSEG_SIZE;
@@ -2178,25 +2177,21 @@ mod tests {
let pg_version = 15; // The test data was generated by pg15
let path = "test_data/sk_wal_segment_from_pgbench";
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
let source_initdb_path = format!("{path}/{INITDB_PATH}");
let startpoint = Lsn::from_hex("14AEC08").unwrap();
let endpoint = Lsn::from_hex("1FFFF98").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
let (tenant, ctx) = harness.load().await;
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
std::fs::create_dir_all(initdb_path.parent().unwrap())
.expect("creating test dir should work");
std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works");
// Bootstrap a real timeline. We can't use create_test_timeline because
// it doesn't create a real checkpoint, and Walingest::new tries to parse
// the garbage data.
//
// TODO use the initdb.tar.zst file stored with the test data to avoid
// problems with inconsistent initdb results after pg minor version bumps.
let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
.unwrap()
.load()
.await;
let tline = tenant
.bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx)
.bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
.await
.unwrap();

View File

@@ -308,13 +308,13 @@ lfc_change_limit_hook(int newval, void *extra)
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
neon_log(LOG, "Failed to punch hole in file: %m");
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
lfc_ctl->used -= 1;
}
lfc_ctl->limit = new_size;
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
elog(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
}
@@ -327,7 +327,7 @@ lfc_init(void)
* shared_preload_libraries.
*/
if (!process_shared_preload_libraries_in_progress)
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomIntVariable("neon.max_file_cache_size",
@@ -643,7 +643,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
neon_log(DEBUG2, "Swap file cache page");
elog(DEBUG2, "Swap file cache page");
}
else
{
@@ -846,10 +846,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
* wrong) function definition though.
*/
if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
neon_log(ERROR, "return type must be a row type");
elog(ERROR, "return type must be a row type");
if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM)
neon_log(ERROR, "incorrect number of output arguments");
elog(ERROR, "incorrect number of output arguments");
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);

View File

@@ -15,7 +15,6 @@
#include "postgres.h"
#include "access/xlog.h"
#include "common/hashfn.h"
#include "fmgr.h"
#include "libpq-fe.h"
#include "libpq/libpq.h"
@@ -38,6 +37,17 @@
#define RECONNECT_INTERVAL_USEC 1000000
bool connected = false;
PGconn *pageserver_conn = NULL;
/*
* WaitEventSet containing:
* - WL_SOCKET_READABLE on pageserver_conn,
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *pageserver_conn_wes = NULL;
/* GUCs */
char *neon_timeline;
char *neon_tenant;
@@ -48,206 +58,87 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
static int stripe_size;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void AssignPageserverConnstring(const char *newval, void *extra);
static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source);
typedef struct
{
LWLockId lock;
pg_atomic_uint64 update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
#if PG_VERSION_NUM >= 150000
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
/*
* ShardMap is kept in shared memory. It contains the connection strings for
* each shard.
*
* There is "neon.pageserver_connstring" GUC with PGC_SIGHUP option, allowing to change it using
* pg_reload_conf(). It is used by control plane to update shards information if page server is crashed,
* relocated or new shards are added. This GUC variable contains comma separated list of connection strings.
* It is copied to shared memory because config can not be loaded during query execution and we need to
* reestablish connection to page server.
*
* So the connection string is usually copied to shared memory by the postmaster, and other backends
* check the update counter to determine whether the connection URL has changed and the connection needs to be reestablished.
*
* But at startup shared memory is not yet initialized, so the copy has to be done by some other process.
* Moreover, we can not use standard Postgres LW-locks, because the postmaster has no proc entry and so can not wait
* on this primitive. This is why a lockless access algorithm is implemented, using two atomic counters to enforce
* consistent reading of the connection string value from shared memory.
*/
typedef struct
static bool pageserver_flush(void);
static void pageserver_disconnect(void);
static bool
PagestoreShmemIsValid()
{
size_t n_shards;
pg_atomic_uint64 begin_update_counter;
pg_atomic_uint64 end_update_counter;
char shard_connstr[MAX_SHARDS][MAX_PS_CONNSTR_LEN];
} ShardMap;
static ShardMap* shard_map;
static uint64 shard_map_update_counter;
typedef struct
{
/*
* Connection for each shard
*/
PGconn *conn;
/*
* WaitEventSet containing:
* - WL_SOCKET_READABLE on 'conn'
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *wes;
} PageServer;
static PageServer page_servers[MAX_SHARDS];
static shardno_t max_attached_shard_no;
/*
 * Shared-memory startup hook: run the previously installed hook (if any),
 * then create or attach to the "shard_map" struct under AddinShmemInitLock.
 * When the struct did not exist yet, reset its shard count and both update
 * counters and pass page_server_connstring to AssignPageserverConnstring().
 */
static void
psm_shmem_startup(void)
{
	bool		map_exists;

	if (prev_shmem_startup_hook != NULL)
		prev_shmem_startup_hook();

	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

	shard_map = (ShardMap *) ShmemInitStruct("shard_map", sizeof(ShardMap), &map_exists);
	if (map_exists)
	{
		/* Somebody else already initialized it; nothing more to do. */
		LWLockRelease(AddinShmemInitLock);
		return;
	}

	shard_map->n_shards = 0;
	pg_atomic_init_u64(&shard_map->begin_update_counter, 0);
	pg_atomic_init_u64(&shard_map->end_update_counter, 0);
	AssignPageserverConnstring(page_server_connstring, NULL);

	LWLockRelease(AddinShmemInitLock);
}
/*
 * Reserve shared memory for the ShardMap. On PG15+ this also chains to the
 * previously installed shmem_request_hook before making the reservation.
 */
static void
psm_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
	if (prev_shmem_request_hook != NULL)
	{
		prev_shmem_request_hook();
	}
#endif

	RequestAddinShmemSpace(sizeof(ShardMap));
}
/*
 * Install the shard-map shared-memory hooks. On PG15+ the space request is
 * deferred to shmem_request_hook; on older versions it is made immediately.
 * The previous hook values are saved so they can be chained to.
 */
static void
psm_init(void)
{
#if PG_VERSION_NUM >= 150000
	prev_shmem_request_hook = shmem_request_hook;
	shmem_request_hook = psm_shmem_request;
#else
	psm_shmem_request();
#endif

	prev_shmem_startup_hook = shmem_startup_hook;
	shmem_startup_hook = psm_shmem_startup;
}
/*
* Read the shard map from shared memory, retrying until a consistent snapshot is seen,
* and return the number of shards.
*
* 'shard_no' is the shard whose connection string is wanted; an ERROR is raised if it is
* not smaller than the number of shards currently in the map.
*
* 'connstr' is an output buffer. If not NULL, it must point to a buffer at least MAX_PS_CONNSTR_LEN bytes
* long. The connection string for the given shard is copied to it.
*
* Side effect: if the map's end_update_counter differs from the value this backend saw last time,
* every open connection below max_attached_shard_no is dropped and max_attached_shard_no is reset to 0.
*/
static shardno_t
load_shard_map(shardno_t shard_no, char* connstr)
{
shardno_t n_shards;
uint64 begin_update_counter;
uint64 end_update_counter;
/*
* There is race condition here between backend and postmaster which can update shard map.
* We recheck update counter after copying shard map to check that configuration was not changed.
*/
do
{
/* Snapshot both counters before touching the map contents. */
begin_update_counter = pg_atomic_read_u64(&shard_map->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&shard_map->end_update_counter);
n_shards = shard_map->n_shards;
/* NOTE(review): this check runs before the snapshot has been validated by the loop condition. */
if (shard_no >= n_shards)
neon_log(ERROR, "Shard %d is greater or equal than number of shards %d", shard_no, n_shards);
if (connstr)
{
/*
* We need to use strlcpy here because, due to the race condition, the string in shared memory
* may not be zero terminated.
*/
strlcpy(connstr, shard_map->shard_connstr[shard_no], MAX_PS_CONNSTR_LEN);
pg_memory_barrier();
}
}
/*
* Retry if the two counters differed when sampled, or if either of them
* has moved since we sampled it.
*/
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&shard_map->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&shard_map->end_update_counter));
if (shard_map_update_counter != end_update_counter)
{
/* Reset all connections if connection strings are changed */
for (shardno_t i = 0; i < max_attached_shard_no; i++)
{
if (page_servers[i].conn)
pageserver_disconnect(i);
}
max_attached_shard_no = 0;
/* Remember which version of the map this backend has now seen. */
shard_map_update_counter = end_update_counter;
}
return n_shards;
}
#define MB (1024*1024)
shardno_t
get_shard_number(BufferTag* tag)
{
shardno_t n_shards = load_shard_map(0, NULL);
uint32 hash;
#if PG_MAJORVERSION_NUM < 16
hash = murmurhash32(tag->rnode.relNode);
hash = hash_combine(hash, murmurhash32(tag->blockNum/stripe_size));
#else
hash = murmurhash32(tag->relNumber);
hash = hash_combine(hash, murmurhash32(tag->blockNum/stripe_size));
#endif
return hash % n_shards;
return pagestore_shared && UsedShmemSegAddr;
}
static bool
pageserver_connect(shardno_t shard_no, int elevel)
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
}
/*
 * Publish 'newval' as the pageserver connection string in shared memory and
 * bump the shared update counter, both under the exclusive pagestore lock.
 * Does nothing when the shared state is not available. 'extra' is unused.
 */
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
	if (PagestoreShmemIsValid())
	{
		LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);

		strlcpy(pagestore_shared->pageserver_connstring,
				newval,
				MAX_PAGESERVER_CONNSTRING_SIZE);
		pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);

		LWLockRelease(pagestore_shared->lock);
	}
}
static bool
CheckConnstringUpdated()
{
if (!PagestoreShmemIsValid())
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
}
/*
 * Copy the connection string from shared memory into this backend's local
 * buffer and record the shared update counter, both while holding the
 * pagestore lock in shared mode. Does nothing when the shared state is not
 * available.
 */
static void
ReloadConnstring()
{
	if (PagestoreShmemIsValid())
	{
		LWLockAcquire(pagestore_shared->lock, LW_SHARED);

		strlcpy(local_pageserver_connstring,
				pagestore_shared->pageserver_connstring,
				sizeof(local_pageserver_connstring));
		pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);

		LWLockRelease(pagestore_shared->lock);
	}
}
static bool
pageserver_connect(int elevel)
{
char *query;
int ret;
const char *keywords[3];
const char *values[3];
int n;
PGconn* conn;
WaitEventSet *wes;
char connstr[MAX_PS_CONNSTR_LEN];
Assert(page_servers[shard_no].conn == NULL);
Assert(!connected);
(void)load_shard_map(shard_no, connstr); /* refresh page map if needed */
if (CheckConnstringUpdated())
{
ReloadConnstring();
}
/*
* Connect using the connection string we got from the
@@ -267,93 +158,50 @@ pageserver_connect(shardno_t shard_no, int elevel)
n++;
}
keywords[n] = "dbname";
values[n] = connstr;
values[n] = local_pageserver_connstring;
n++;
keywords[n] = NULL;
values[n] = NULL;
n++;
conn = PQconnectdbParams(keywords, values, 1);
pageserver_conn = PQconnectdbParams(keywords, values, 1);
if (PQstatus(conn) == CONNECTION_BAD)
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
{
char *msg = pchomp(PQerrorMessage(conn));
char *msg = pchomp(PQerrorMessage(pageserver_conn));
PQfinish(conn);
PQfinish(pageserver_conn);
pageserver_conn = NULL;
ereport(elevel,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
errmsg(NEON_TAG "could not establish connection to pageserver"),
errdetail_internal("%s", msg)));
return false;
}
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
ret = PQsendQuery(conn, query);
ret = PQsendQuery(pageserver_conn, query);
if (ret != 1)
{
PQfinish(conn);
neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
PQfinish(pageserver_conn);
pageserver_conn = NULL;
neon_log(elevel, "could not send pagestream command to pageserver");
return false;
}
wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
while (PQisBusy(conn))
while (PQisBusy(pageserver_conn))
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(conn))
{
char *msg = pchomp(PQerrorMessage(conn));
PQfinish(conn);
FreeWaitEventSet(wes);
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
msg);
return false;
}
}
}
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
page_servers[shard_no].conn = conn;
page_servers[shard_no].wes = wes;
max_attached_shard_no = Max(shard_no+1, max_attached_shard_no);
return true;
}
/*
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(shardno_t shard_no, char **buffer)
{
int ret;
PGconn* pageserver_conn = page_servers[shard_no].conn;
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
if (ret == 0)
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -365,7 +213,53 @@ retry:
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
PQfinish(pageserver_conn);
pageserver_conn = NULL;
FreeWaitEventSet(pageserver_conn_wes);
pageserver_conn_wes = NULL;
neon_log(elevel, "could not complete handshake with pageserver: %s",
msg);
return false;
}
}
}
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
connected = true;
return true;
}
/*
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(char **buffer)
{
int ret;
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
if (ret == 0)
{
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_log(LOG, "could not get response from pageserver: %s", msg);
pfree(msg);
return -1;
}
@@ -379,7 +273,7 @@ retry:
static void
pageserver_disconnect(shardno_t shard_no)
pageserver_disconnect(void)
{
/*
* If anything goes wrong while we were sending a request, it's not clear
@@ -388,32 +282,38 @@ pageserver_disconnect(shardno_t shard_no)
* time later after we have already sent a new unrelated request. Close
* the connection to avoid getting confused.
*/
if (page_servers[shard_no].conn)
if (connected)
{
neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
PQfinish(page_servers[shard_no].conn);
page_servers[shard_no].conn = NULL;
neon_log(LOG, "dropping connection to page server due to error");
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
prefetch_on_ps_disconnect();
}
if (page_servers[shard_no].wes != NULL)
if (pageserver_conn_wes != NULL)
{
FreeWaitEventSet(page_servers[shard_no].wes);
page_servers[shard_no].wes = NULL;
FreeWaitEventSet(pageserver_conn_wes);
pageserver_conn_wes = NULL;
}
}
static bool
pageserver_send(shardno_t shard_no, NeonRequest *request)
pageserver_send(NeonRequest *request)
{
StringInfoData req_buff;
PGconn* pageserver_conn = page_servers[shard_no].conn;
if (CheckConnstringUpdated())
{
pageserver_disconnect();
ReloadConnstring();
}
/* If the connection was lost for some reason, reconnect */
if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
pageserver_disconnect(shard_no);
neon_log(LOG, "pageserver_send disconnect bad connection");
pageserver_disconnect();
}
req_buff = nm_pack_request(request);
@@ -427,9 +327,9 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
* connection in case of failure.
*/
if (!page_servers[shard_no].conn)
if (!connected)
{
while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
{
HandleMainLoopInterrupts();
n_reconnect_attempts += 1;
@@ -438,9 +338,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
n_reconnect_attempts = 0;
}
pageserver_conn = page_servers[shard_no].conn;
/*
/*
* Send request.
*
* In principle, this could block if the output buffer is full, and we
@@ -451,8 +349,9 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pageserver_disconnect();
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pfree(msg);
pfree(req_buff.data);
return false;
@@ -464,19 +363,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
{
char *msg = nm_to_string((NeonMessage *) request);
neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
neon_log(PageStoreTrace, "sent request: %s", msg);
pfree(msg);
}
return true;
}
static NeonResponse *
pageserver_receive(shardno_t shard_no)
pageserver_receive(void)
{
StringInfoData resp_buff;
NeonResponse *resp;
PGconn* pageserver_conn = page_servers[shard_no].conn;
if (!pageserver_conn)
if (!connected)
return NULL;
PG_TRY();
@@ -484,7 +383,7 @@ pageserver_receive(shardno_t shard_no)
/* read response */
int rc;
rc = call_PQgetCopyData(shard_no, &resp_buff.data);
rc = call_PQgetCopyData(&resp_buff.data);
if (rc >= 0)
{
resp_buff.len = rc;
@@ -496,33 +395,33 @@ pageserver_receive(shardno_t shard_no)
{
char *msg = nm_to_string((NeonMessage *) resp);
neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
neon_log(PageStoreTrace, "got response: %s", msg);
pfree(msg);
}
}
else if (rc == -1)
{
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect(shard_no);
neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect();
resp = NULL;
}
else if (rc == -2)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
}
else
{
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
}
}
PG_CATCH();
{
neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
pageserver_disconnect(shard_no);
neon_log(LOG, "pageserver_receive disconnect due to caught exception");
pageserver_disconnect();
PG_RE_THROW();
}
PG_END_TRY();
@@ -532,12 +431,11 @@ pageserver_receive(shardno_t shard_no)
static bool
pageserver_flush(shardno_t shard_no)
pageserver_flush(void)
{
PGconn* pageserver_conn = page_servers[shard_no].conn;
if (!pageserver_conn)
if (!connected)
{
neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
neon_log(WARNING, "Tried to flush while disconnected");
}
else
{
@@ -545,8 +443,8 @@ pageserver_flush(shardno_t shard_no)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect(shard_no);
neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pageserver_disconnect();
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pfree(msg);
return false;
}
@@ -569,83 +467,63 @@ check_neon_id(char **newval, void **extra, GucSource source)
return **newval == '\0' || HexDecodeString(id, *newval, 16);
}
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
static Size
PagestoreShmemSize(void)
{
const char* shard_connstr = *newval;
const char* sep;
size_t connstr_len;
int i = 0;
do
{
sep = strchr(shard_connstr, ',');
connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
if (connstr_len == 0)
break; /* trailing comma */
if (i >= MAX_SHARDS)
{
neon_log(LOG, "Too many shards");
return false;
}
if (connstr_len >= MAX_PS_CONNSTR_LEN)
{
neon_log(LOG, "Connection string too long");
return false;
}
shard_connstr = sep + 1;
i += 1;
} while (sep != NULL);
return sizeof(PagestoreShmemState);
}
return true;
/*
 * Create or attach to the "libpagestore shared state" struct while holding
 * AddinShmemInitLock. When the struct is newly created, set up its LWLock
 * from the "neon_libpagestore" tranche, zero the update counter and pass
 * page_server_connstring to AssignPageserverConnstring().
 *
 * Returns true if the struct already existed, false if it was created here.
 */
static bool
PagestoreShmemInit(void)
{
	bool		already_present;

	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

	pagestore_shared = ShmemInitStruct("libpagestore shared state",
									   PagestoreShmemSize(),
									   &already_present);

	if (already_present)
	{
		LWLockRelease(AddinShmemInitLock);
		return already_present;
	}

	/* First one here: initialize the freshly allocated state. */
	pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
	pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
	AssignPageserverConnstring(page_server_connstring, NULL);

	LWLockRelease(AddinShmemInitLock);
	return already_present;
}
static void
AssignPageserverConnstring(const char *newval, void *extra)
pagestore_shmem_startup_hook(void)
{
/*
* Load shard map only at Postmaster.
* If old page server is not available, then backends can be blocked in attempts to reconnect to it and do not reload config in this loop
*
* Copying GUC value to shared memory is usually performed by postmaster. But in case of startup,
* shared memory is not yet initialized. So it has to be performed by any other process.
* It is not a problem if more than one process do this initialization.
*/
if (shard_map != NULL && UsedShmemSegAddr != NULL && (MyProcPid == PostmasterPid || shard_map->n_shards == 0))
{
const char* shard_connstr = newval;
const char* sep;
size_t connstr_len;
int i = 0;
bool shard_map_changed = false;
do
{
sep = strchr(shard_connstr, ',');
connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
if (connstr_len == 0)
break; /* trailing comma */
Assert(i < MAX_SHARDS);
Assert(connstr_len < MAX_PS_CONNSTR_LEN);
if (i >= shard_map->n_shards ||
strcmp(shard_map->shard_connstr[i], shard_connstr) != 0)
{
if (!shard_map_changed)
{
pg_atomic_add_fetch_u64(&shard_map->begin_update_counter, 1);
shard_map_changed = true;
}
memcpy(shard_map->shard_connstr[i], shard_connstr, connstr_len+1);
}
shard_connstr = sep + 1;
i += 1;
} while (sep != NULL);
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
if (shard_map_changed)
{
shard_map->n_shards = i;
pg_memory_barrier();
pg_atomic_add_fetch_u64(&shard_map->end_update_counter, 1);
}
}
PagestoreShmemInit();
}
static void
pagestore_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(PagestoreShmemSize());
RequestNamedLWLockTranche("neon_libpagestore", 1);
}
/*
 * Install libpagestore's shared-memory hooks.
 *
 * The startup hook is always chained. The request side depends on the
 * server version: before PostgreSQL 15 the request is issued directly,
 * from 15 onwards it is deferred to shmem_request_hook.
 */
static void
pagestore_prepare_shmem(void)
{
    /* chain the startup hook, remembering the previous one */
    prev_shmem_startup_hook = shmem_startup_hook;
    shmem_startup_hook = pagestore_shmem_startup_hook;

#if PG_VERSION_NUM < 150000
    /* no request hook available: ask for the resources right away */
    pagestore_shmem_request();
#else
    prev_shmem_request_hook = shmem_request_hook;
    shmem_request_hook = pagestore_shmem_request;
#endif
}
/*
@@ -654,6 +532,8 @@ AssignPageserverConnstring(const char *newval, void *extra)
void
pg_init_libpagestore(void)
{
pagestore_prepare_shmem();
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
@@ -681,15 +561,6 @@ pg_init_libpagestore(void)
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomIntVariable("neon.stripe_size",
"sharding stripe size",
NULL,
&stripe_size,
32768, 1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_BLOCKS,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.max_cluster_size",
"cluster size limit",
NULL,
@@ -753,5 +624,4 @@ pg_init_libpagestore(void)
}
lfc_init();
psm_init();
}

View File

@@ -17,20 +17,12 @@
#include "access/xlogdefs.h"
#include RELFILEINFO_HDR
#include "storage/block.h"
#include "storage/smgr.h"
#include "storage/buf_internals.h"
#include "lib/stringinfo.h"
#include "libpq/pqformat.h"
#include "storage/block.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "pg_config.h"
#define MAX_SHARDS 128
#define MAX_PS_CONNSTR_LEN 128
typedef enum
{
/* pagestore_client -> pagestore */
@@ -59,9 +51,6 @@ typedef struct
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
/*
* supertype of all the Neon*Request structs below
@@ -152,13 +141,11 @@ extern char *nm_to_string(NeonMessage *msg);
* API
*/
typedef unsigned shardno_t;
typedef struct
{
bool (*send) (shardno_t shard_no, NeonRequest * request);
NeonResponse *(*receive) (shardno_t shard_no);
bool (*flush) (shardno_t shard_no);
bool (*send) (NeonRequest *request);
NeonResponse *(*receive) (void);
bool (*flush) (void);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);
@@ -172,8 +159,6 @@ extern char *neon_timeline;
extern char *neon_tenant;
extern int32 max_cluster_size;
extern shardno_t get_shard_number(BufferTag* tag);
extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
extern void smgr_init_neon(void);
extern void readahead_buffer_resize(int newsize, void *extra);

View File

@@ -172,7 +172,6 @@ typedef struct PrefetchRequest
XLogRecPtr actual_request_lsn;
NeonResponse *response; /* may be null */
PrefetchStatus status;
shardno_t shard_no;
uint64 my_ring_index;
} PrefetchRequest;
@@ -240,9 +239,7 @@ typedef struct PrefetchState
* also unused */
/* the buffers */
prfh_hash *prf_hash;
int max_shard_no;
uint8 shard_bitmap[(MAX_SHARDS + 7)/8];
prfh_hash *prf_hash;
PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;
@@ -330,7 +327,6 @@ compact_prefetch_buffers(void)
Assert(target_slot->status == PRFS_UNUSED);
target_slot->buftag = source_slot->buftag;
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -498,23 +494,6 @@ prefetch_cleanup_trailing_unused(void)
}
}
/*
 * Flush the connection of every shard whose bit is set in
 * MyPState->shard_bitmap, clearing each bit once its flush succeeds.
 *
 * Returns false as soon as one flush fails (remaining bits and
 * max_shard_no are left untouched in that case); otherwise resets
 * max_shard_no to 0 and returns true.
 */
static bool
prefetch_flush_requests(void)
{
    shardno_t   shard;

    for (shard = 0; shard < MyPState->max_shard_no; shard++)
    {
        /* one bit per shard, eight shards per bitmap byte */
        const unsigned byte_idx = shard >> 3;
        const int   bit_mask = 1 << (shard & 7);

        if ((MyPState->shard_bitmap[byte_idx] & bit_mask) == 0)
            continue;           /* nothing pending for this shard */

        if (!page_server->flush(shard))
            return false;

        MyPState->shard_bitmap[byte_idx] &= ~bit_mask;
    }

    MyPState->max_shard_no = 0;
    return true;
}
/*
* Wait for slot of ring_index to have received its response.
* The caller is responsible for making sure the request buffer is flushed.
@@ -530,7 +509,7 @@ prefetch_wait_for(uint64 ring_index)
if (MyPState->ring_flush <= ring_index &&
MyPState->ring_unused > MyPState->ring_flush)
{
if (!prefetch_flush_requests())
if (!page_server->flush())
return false;
MyPState->ring_flush = MyPState->ring_unused;
}
@@ -568,7 +547,7 @@ prefetch_read(PrefetchRequest *slot)
Assert(slot->my_ring_index == MyPState->ring_receive);
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(slot->shard_no);
response = (NeonResponse *) page_server->receive();
MemoryContextSwitchTo(old);
if (response)
{
@@ -725,14 +704,12 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
while (!page_server->send((NeonRequest *) &request));
/* update prefetch state */
MyPState->n_requests_inflight += 1;
MyPState->n_unused -= 1;
MyPState->ring_unused += 1;
MyPState->shard_bitmap[slot->shard_no >> 3] |= 1 << (slot->shard_no & 7);
MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
/* update slot state */
slot->status = PRFS_REQUESTED;
@@ -903,7 +880,6 @@ Retry:
* function reads the buffer tag from the slot.
*/
slot->buftag = tag;
slot->shard_no = get_shard_number(&tag);
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_latest, force_lsn);
@@ -914,7 +890,7 @@ Retry:
if (flush_every_n_requests > 0 &&
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
{
if (!prefetch_flush_requests())
if (!page_server->flush())
{
/*
* Prefetch set is reset in case of error, so we should try to
@@ -932,44 +908,13 @@ static NeonResponse *
page_server_request(void const *req)
{
NeonResponse *resp;
BufferTag tag = {0};
shardno_t shard_no;
switch (((NeonRequest *) req)->tag)
{
case T_NeonExistsRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
break;
case T_NeonNblocksRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
break;
case T_NeonDbSizeRequest:
NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
break;
case T_NeonGetPageRequest:
CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
break;
default:
neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
}
shard_no = get_shard_number(&tag);
/*
* Current sharding model assumes that all metadata is present only at shard 0.
* We still need to call get_shard_no() to check if shard map is up-to-date.
*/
if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
{
shard_no = 0;
}
do
{
while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
resp = page_server->receive(shard_no);
resp = page_server->receive();
} while (resp == NULL);
return resp;
@@ -1045,7 +990,7 @@ nm_pack_request(NeonRequest *msg)
case T_NeonErrorResponse:
case T_NeonDbSizeResponse:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
break;
}
return s;
@@ -1140,7 +1085,7 @@ nm_unpack_response(StringInfo s)
case T_NeonGetPageRequest:
case T_NeonDbSizeRequest:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
elog(ERROR, "unexpected neon message tag 0x%02x", tag);
break;
}
@@ -1332,7 +1277,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
XLogFlush(recptr);
lsn = recptr;
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1360,7 +1305,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
if (PageIsNew((Page) buffer))
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros",
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1368,7 +1313,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else if (PageIsEmptyHeapPage((Page) buffer))
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1376,7 +1321,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(PANIC,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum)));
@@ -1385,7 +1330,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
else
{
ereport(SmgrTrace,
(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
blocknum,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1485,7 +1430,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
lsn = nm_adjust_lsn(lsn);
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
}
else
@@ -1500,7 +1445,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
*latest = true;
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
Assert(lsn != InvalidXLogRecPtr);
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
lsn = nm_adjust_lsn(lsn);
@@ -1520,7 +1465,7 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
#endif
if (lsn > flushlsn)
{
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
(uint32) (lsn >> 32), (uint32) lsn,
(uint32) (flushlsn >> 32), (uint32) flushlsn);
XLogFlush(lsn);
@@ -1564,7 +1509,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return mdexists(reln, forkNum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
@@ -1616,7 +1561,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -1625,7 +1570,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
return exists;
@@ -1642,7 +1587,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1653,10 +1598,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_log(SmgrTrace, "Create relation %u/%u/%u.%u",
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
@@ -1751,7 +1696,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1762,7 +1707,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/*
@@ -1781,7 +1726,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,
(errcode(ERRCODE_DISK_FULL),
errmsg(NEON_TAG "could not extend file because project size limit (%d MB) has been exceeded",
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
max_cluster_size),
errhint("This limit is defined externally by the project size limit, and internally by neon.max_cluster_size GUC")));
}
@@ -1800,7 +1745,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -1840,7 +1785,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -1851,7 +1796,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (max_cluster_size > 0 &&
@@ -1863,7 +1808,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
ereport(ERROR,
(errcode(ERRCODE_DISK_FULL),
errmsg(NEON_TAG "could not extend file because cluster size limit (%d MB) has been exceeded",
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
max_cluster_size),
errhint("This limit is defined by neon.max_cluster_size GUC")));
}
@@ -1876,7 +1821,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks",
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
@@ -1937,7 +1882,7 @@ neon_open(SMgrRelation reln)
mdopen(reln);
/* no work */
neon_log(SmgrTrace, "open noop");
elog(SmgrTrace, "[NEON_SMGR] open noop");
}
/*
@@ -1974,7 +1919,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
return mdprefetch(reln, forknum, blocknum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
@@ -2019,11 +1964,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* not implemented */
neon_log(SmgrTrace, "writeback noop");
elog(SmgrTrace, "[NEON_SMGR] writeback noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2153,8 +2098,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
slot->shard_no, blkno,
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
blkno,
RelFileInfoFmt(rinfo),
forkNum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2162,7 +2107,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
((NeonErrorResponse *) resp)->message)));
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
/* buffer was used, clean up for later reuse */
@@ -2186,7 +2131,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
elog(ERROR, "cannot call smgrread() on rel with unknown persistence");
case RELPERSISTENCE_PERMANENT:
break;
@@ -2197,7 +2142,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* Try to read from local file cache */
@@ -2225,7 +2170,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
{
if (!PageIsNew((Page) pageserver_masked))
{
neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2235,7 +2180,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
}
else if (PageIsNew((Page) buffer))
{
neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2250,7 +2195,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2269,7 +2214,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0)
{
neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
blkno,
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
@@ -2349,13 +2294,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_wallog_page(reln, forknum, blocknum, buffer, false);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
@@ -2382,7 +2327,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2393,12 +2338,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return mdnblocks(reln, forknum);
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
@@ -2426,7 +2371,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
@@ -2435,11 +2380,11 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
@@ -2482,7 +2427,7 @@ neon_dbsize(Oid dbNode)
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
@@ -2490,10 +2435,10 @@ neon_dbsize(Oid dbNode)
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
db_size);
@@ -2513,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2525,7 +2470,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
@@ -2581,7 +2526,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2593,10 +2538,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
elog(SmgrTrace, "[NEON_SMGR] immedsync noop");
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2621,17 +2566,17 @@ neon_start_unlogged_build(SMgrRelation reln)
* progress at a time. That's enough for the current usage.
*/
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
neon_log(ERROR, "unlogged relation build is already in progress");
elog(ERROR, "unlogged relation build is already in progress");
Assert(unlogged_build_rel == NULL);
ereport(SmgrTrace,
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
(errmsg("starting unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
switch (reln->smgr_relpersistence)
{
case 0:
neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
@@ -2644,11 +2589,11 @@ neon_start_unlogged_build(SMgrRelation reln)
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
elog(ERROR, "cannot perform unlogged index build, index is not empty ");
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
@@ -2675,7 +2620,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2704,7 +2649,7 @@ neon_end_unlogged_build(SMgrRelation reln)
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
(errmsg("ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
@@ -2719,7 +2664,7 @@ neon_end_unlogged_build(SMgrRelation reln)
rinfob = InfoBFromSMgrRel(reln);
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
@@ -2762,7 +2707,7 @@ AtEOXact_neon(XactEvent event, void *arg)
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
(errmsg(NEON_TAG "unlogged index build was not properly finished"))));
(errmsg("unlogged index build was not properly finished"))));
}
break;
}
@@ -2861,14 +2806,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
set_cached_relsize(rinfo, forknum, relsize);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
neon_log(SmgrTrace, "Set length to %d", relsize);
elog(SmgrTrace, "Set length to %d", relsize);
}
}
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
/*
* TODO: May be it is better to make correspondent function from freespace.c public?
* TODO: Maybe it is better to make the corresponding function from freespace.c public?
*/
static BlockNumber
get_fsm_physical_block(BlockNumber heapblk)
@@ -2949,7 +2894,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
#if PG_VERSION_NUM < 150000
if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
neon_log(PANIC, "failed to locate backup block with ID %d", block_id);
elog(PANIC, "failed to locate backup block with ID %d", block_id);
#else
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
#endif

View File

@@ -1716,7 +1716,25 @@ walprop_pg_after_election(WalProposer *wp)
fclose(f);
if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
{
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
uint64      download_range_mb;

elog(LOG, "Logical replication restart LSN %X/%X, epochStartLsn %X/%X, max_slot_wal_keep_size_mb=%d",
     LSN_FORMAT_ARGS(lrRestartLsn), LSN_FORMAT_ARGS(wp->propEpochStartLsn), max_slot_wal_keep_size_mb);

/*
 * If we need to download more than a max_slot_wal_keep_size, cap to it to
 * avoid risk of exploding pg_wal. Logical replication won't work until
 * recreated, but at least compute would start; this also follows
 * max_slot_wal_keep_size semantics.
 *
 * NOTE(review): the subtraction below is unsigned; if lrRestartLsn could
 * ever be ahead of propEpochStartLsn it would wrap to a huge value and
 * trigger the cap. Confirm callers guarantee lrRestartLsn <= epoch start.
 */
download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / 1024 / 1024;
if (max_slot_wal_keep_size_mb > 0 &&
    download_range_mb >= (uint64) max_slot_wal_keep_size_mb)
{
    /*
     * Widen to 64 bits before scaling MB to bytes: the setting is an int,
     * so "mb * 1024 * 1024" evaluated in int overflows for any value of
     * 2048 MB or more and would yield a bogus restart LSN.
     */
    lrRestartLsn = wp->propEpochStartLsn -
        ((uint64) max_slot_wal_keep_size_mb) * 1024 * 1024;
    elog(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
         LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
}
/*
* start from the beginning of the segment to fetch page headers

View File

@@ -11,7 +11,6 @@ use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
use anyhow::bail;
@@ -96,8 +95,12 @@ struct ProxyCliArgs {
/// Allow self-signed certificates for compute nodes (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
allow_self_signed_compute: bool,
#[clap(flatten)]
sql_over_http: SqlOverHttpArgs,
/// timeout for http connections
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Whether the SQL over http pool is opt-in
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
sql_over_http_pool_opt_in: bool,
/// timeout for scram authentication protocol
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
scram_protocol_timeout: tokio::time::Duration,
@@ -135,36 +138,6 @@ struct ProxyCliArgs {
disable_ip_check_for_http: bool,
}
// Command-line options for the SQL-over-HTTP endpoint and its connection
// pool. Flattened into `ProxyCliArgs` via `#[clap(flatten)]`, so every field
// below is exposed as a top-level `--sql-over-http-*` flag.
//
// The `///` comments on the fields double as the user-visible `--help` text,
// so they are kept short and user-facing; maintainer notes use `//`.
#[derive(clap::Args, Clone, Copy, Debug)]
struct SqlOverHttpArgs {
    /// timeout for http connection requests
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    sql_over_http_timeout: tokio::time::Duration,

    /// Whether the SQL over http pool is opt-in
    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    sql_over_http_pool_opt_in: bool,

    /// How many connections to pool for each endpoint. Excess connections are discarded
    #[clap(long, default_value_t = 20)]
    sql_over_http_pool_max_conns_per_endpoint: usize,

    /// How long pooled connections should remain idle for before closing
    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
    sql_over_http_idle_timeout: tokio::time::Duration,

    /// Duration each shard will wait on average before a GC sweep.
    /// A longer time will cause sweeps to take longer but will interfere less frequently.
    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
    sql_over_http_pool_gc_epoch: tokio::time::Duration,

    // NOTE(review): the help text requires a power of two, but nothing in
    // this struct validates it — confirm the pool constructor checks it.
    /// How many shards should the global pool have. Must be a power of two.
    /// More shards will introduce less contention for pool operations, but can
    /// increase memory used by the pool
    #[clap(long, default_value_t = 128)]
    sql_over_http_pool_shards: usize,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = proxy::logging::init().await?;
@@ -354,14 +327,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
}
};
let http_config = HttpConfig {
request_timeout: args.sql_over_http.sql_over_http_timeout,
pool_options: GlobalConnPoolOptions {
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
pool_shards: args.sql_over_http.sql_over_http_pool_shards,
idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
},
timeout: args.sql_over_http_timeout,
pool_opt_in: args.sql_over_http_pool_opt_in,
};
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,

View File

@@ -1,4 +1,4 @@
use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
use crate::{auth, rate_limiter::RateBucketInfo};
use anyhow::{bail, ensure, Context, Ok};
use rustls::{sign, Certificate, PrivateKey};
use sha2::{Digest, Sha256};
@@ -36,8 +36,8 @@ pub struct TlsConfig {
}
pub struct HttpConfig {
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions,
pub timeout: tokio::time::Duration,
pub pool_opt_in: bool,
}
pub struct AuthenticationConfig {

View File

@@ -6,13 +6,9 @@ mod conn_pool;
mod sql_over_http;
mod websocket;
pub use conn_pool::GlobalConnPoolOptions;
use anyhow::bail;
use hyper::StatusCode;
use metrics::IntCounterPairGuard;
use rand::rngs::StdRng;
use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;
@@ -51,11 +47,6 @@ pub async fn task_main(
let conn_pool = conn_pool::GlobalConnPool::new(config);
let conn_pool2 = Arc::clone(&conn_pool);
tokio::spawn(async move {
conn_pool2.gc_worker(StdRng::from_entropy()).await;
});
// shutdown the connection pool
tokio::spawn({
let cancellation_token = cancellation_token.clone();

View File

@@ -1,19 +1,15 @@
use anyhow::{anyhow, Context};
use async_trait::async_trait;
use dashmap::DashMap;
use futures::{future::poll_fn, Future};
use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard};
use once_cell::sync::Lazy;
use futures::future::poll_fn;
use parking_lot::RwLock;
use pbkdf2::{
password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
Params, Pbkdf2,
};
use pq_proto::StartupMessageParams;
use prometheus::{exponential_buckets, register_histogram, Histogram};
use rand::Rng;
use smol_str::SmolStr;
use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
use std::{collections::HashMap, net::IpAddr, sync::Arc};
use std::{
fmt,
task::{ready, Poll},
@@ -22,7 +18,7 @@ use std::{
ops::Deref,
sync::atomic::{self, AtomicUsize},
};
use tokio::time::{self, Instant};
use tokio::time;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
@@ -34,10 +30,11 @@ use crate::{
};
use crate::{compute, config};
use tracing::{debug, error, warn, Span};
use tracing::{error, warn, Span};
use tracing::{info, info_span, Instrument};
pub const APP_NAME: &str = "/sql_over_http";
const MAX_CONNS_PER_ENDPOINT: usize = 20;
#[derive(Debug, Clone)]
pub struct ConnInfo {
@@ -72,77 +69,6 @@ struct ConnPoolEntry {
pub struct EndpointConnPool {
pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
total_conns: usize,
max_conns: usize,
_guard: IntCounterPairGuard,
}
// Operations on the pool of idle connections for a single endpoint.
// `total_conns` is adjusted by every method here so that it tracks the number of
// entries held across the per-(db, user) `conns` lists.
impl EndpointConnPool {
/// Takes an idle connection for the given `(db, user)` key out of the pool.
///
/// Returns `None` when there is no pool for that key or that pool has nothing to hand out.
fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
// destructure so that `pools` and `total_conns` can be borrowed mutably at the same time
let Self {
pools, total_conns, ..
} = self;
pools
.get_mut(&db_user)
.and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
}
/// Drops every pooled entry for `db_user` whose connection id equals `conn_id`.
///
/// Returns `true` if at least one entry was removed, `false` if nothing matched
/// or there is no pool for `db_user`.
fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
let Self {
pools, total_conns, ..
} = self;
if let Some(pool) = pools.get_mut(&db_user) {
let old_len = pool.conns.len();
pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
let new_len = pool.conns.len();
// keep the endpoint-wide counter in step with what was just dropped
let removed = old_len - new_len;
*total_conns -= removed;
removed > 0
} else {
false
}
}
/// Hands `client` back to the endpoint pool so that it can be reused.
///
/// The connection is dropped instead of pooled when it is already closed, when the
/// endpoint already holds `max_conns` connections, or when no `(db, user)` entry
/// exists for `conn_info`. Every path returns `Ok(())`.
fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
let conn_id = client.conn_id;
if client.inner.is_closed() {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
return Ok(());
}
// return connection to the pool
let mut returned = false;
let mut per_db_size = 0;
// the write guard lives only inside this block expression
let total_conns = {
let mut pool = pool.write();
if pool.total_conns < pool.max_conns {
// we create this db-user entry in get, so it should not be None
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
}
}
pool.total_conns
};
// do logging outside of the mutex
if returned {
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
// NOTE: this branch is also reached when the (db, user) entry was missing, not only when the pool is full
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
Ok(())
}
}
/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
@@ -161,27 +87,6 @@ pub struct DbUserConnPool {
password_hash: Option<PasswordHashString>,
}
impl DbUserConnPool {
    /// Drops every pooled entry whose underlying connection reports itself closed and
    /// subtracts the number of dropped entries from `conns`.
    fn clear_closed_clients(&mut self, conns: &mut usize) {
        let before = self.conns.len();
        self.conns.retain(|entry| !entry.conn.inner.is_closed());
        *conns -= before - self.conns.len();
    }

    /// Prunes closed connections, then pops the most recently pooled entry.
    ///
    /// `conns` is decremented once more when an entry is actually returned.
    fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry> {
        self.clear_closed_clients(conns);
        let entry = self.conns.pop()?;
        *conns -= 1;
        Some(entry)
    }
}
pub struct GlobalConnPool {
// endpoint -> per-endpoint connection pool
//
@@ -189,127 +94,52 @@ pub struct GlobalConnPool {
// pool as early as possible and release the lock.
global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
/// Number of endpoint-connection pools
///
/// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
/// That seems like far too much effort, so we're using a relaxed increment counter instead.
/// It's only used for diagnostics.
global_pool_size: AtomicUsize,
proxy_config: &'static crate::config::ProxyConfig,
}
#[derive(Debug, Clone, Copy)]
pub struct GlobalConnPoolOptions {
// Maximum number of connections per one endpoint.
// Can mix different (dbname, username) connections.
// When running out of free slots for a particular endpoint,
// falls back to opening a new connection for each request.
pub max_conns_per_endpoint: usize,
max_conns_per_endpoint: usize,
pub gc_epoch: Duration,
proxy_config: &'static crate::config::ProxyConfig,
pub pool_shards: usize,
pub idle_timeout: Duration,
pub opt_in: bool,
// Using a lock to remove any race conditions.
// Eg cleaning up connections while a new connection is returned
closed: RwLock<bool>,
}
/// Histogram, in seconds, registered lazily on first use.
/// The pool's `gc` sweep starts a timer on it after taking a shard lock and records
/// the elapsed time once that lock has been dropped.
pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_http_pool_reclaimation_lag_seconds",
"Time it takes to reclaim unused connection pools",
// 16 buckets doubling from 1us: the largest finite bound is 1us * 2^15 (~33ms)
exponential_buckets(1e-6, 2.0, 16).unwrap(),
)
.unwrap()
});
/// Pair of counters (registered / unregistered) for per-endpoint connection pools,
/// registered lazily on first use.
/// Each `EndpointConnPool` stores a guard obtained from `ENDPOINT_POOLS.guard()` in its `_guard` field.
// NOTE(review): presumably the guard bumps the first counter when created and the second when dropped; confirm in the `metrics` crate.
pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
register_int_counter_pair!(
"proxy_http_pool_endpoints_registered_total",
"Number of endpoints we have registered pools for",
"proxy_http_pool_endpoints_unregistered_total",
"Number of endpoints we have unregistered pools for",
)
.unwrap()
});
impl GlobalConnPool {
pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
let shards = config.http_config.pool_options.pool_shards;
Arc::new(Self {
global_pool: DashMap::with_shard_amount(shards),
global_pool: DashMap::new(),
global_pool_size: AtomicUsize::new(0),
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
proxy_config: config,
closed: RwLock::new(false),
})
}
pub fn shutdown(&self) {
// drops all strong references to endpoint-pools
self.global_pool.clear();
}
*self.closed.write() = true;
pub async fn gc_worker(&self, mut rng: impl Rng) {
let epoch = self.proxy_config.http_config.pool_options.gc_epoch;
let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
loop {
interval.tick().await;
self.global_pool.retain(|_, endpoint_pool| {
let mut pool = endpoint_pool.write();
// by clearing this hashmap, we remove the slots that a connection can be returned to.
// when returning, it drops the connection if the slot doesn't exist
pool.pools.clear();
pool.total_conns = 0;
let shard = rng.gen_range(0..self.global_pool.shards().len());
self.gc(shard);
}
}
fn gc(&self, shard: usize) {
debug!(shard, "pool: performing epoch reclamation");
// acquire a random shard lock
let mut shard = self.global_pool.shards()[shard].write();
let timer = GC_LATENCY.start_timer();
let current_len = shard.len();
shard.retain(|endpoint, x| {
// if the current endpoint pool is unique (no other strong or weak references)
// then it is currently not in use by any connections.
if let Some(pool) = Arc::get_mut(x.get_mut()) {
let EndpointConnPool {
pools, total_conns, ..
} = pool.get_mut();
// ensure that closed clients are removed
pools
.iter_mut()
.for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns));
// we only remove this pool if it has no active connections
if *total_conns == 0 {
info!("pool: discarding pool for endpoint {endpoint}");
return false;
}
}
true
false
});
let new_len = shard.len();
drop(shard);
timer.observe_duration();
let removed = current_len - new_len;
if removed > 0 {
let global_pool_size = self
.global_pool_size
.fetch_sub(removed, atomic::Ordering::Relaxed)
- removed;
info!("pool: performed global pool gc. size now {global_pool_size}");
}
}
pub async fn get(
self: &Arc<Self>,
conn_info: ConnInfo,
conn_info: &ConnInfo,
force_new: bool,
session_id: uuid::Uuid,
peer_addr: IpAddr,
@@ -317,11 +147,15 @@ impl GlobalConnPool {
let mut client: Option<ClientInner> = None;
let mut latency_timer = LatencyTimer::new("http");
let pool = if force_new {
None
} else {
Some((conn_info.clone(), self.clone()))
};
let mut hash_valid = false;
let mut endpoint_pool = Weak::new();
if !force_new {
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
endpoint_pool = Arc::downgrade(&pool);
let mut hash = None;
// find a pool entry by (dbname, username) if exists
@@ -346,8 +180,12 @@ impl GlobalConnPool {
// we will continue with the regular connection flow
if validate.is_ok() {
hash_valid = true;
if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) {
client = Some(entry.conn)
let mut pool = pool.write();
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
if let Some(entry) = pool_entries.conns.pop() {
client = Some(entry.conn);
pool.total_conns -= 1;
}
}
}
}
@@ -360,12 +198,11 @@ impl GlobalConnPool {
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(
self.proxy_config,
&conn_info,
conn_info,
conn_id,
session_id,
latency_timer,
peer_addr,
endpoint_pool.clone(),
)
.await
} else {
@@ -377,19 +214,18 @@ impl GlobalConnPool {
);
latency_timer.pool_hit();
latency_timer.success();
return Ok(Client::new(client, conn_info, endpoint_pool).await);
return Ok(Client::new(client, pool).await);
}
} else {
let conn_id = uuid::Uuid::new_v4();
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
connect_to_compute(
self.proxy_config,
&conn_info,
conn_info,
conn_id,
session_id,
latency_timer,
peer_addr,
endpoint_pool.clone(),
)
.await
};
@@ -433,7 +269,59 @@ impl GlobalConnPool {
_ => {}
}
let new_client = new_client?;
Ok(Client::new(new_client, conn_info, endpoint_pool).await)
Ok(Client::new(new_client, pool).await)
}
/// Hands `client` back to the pool of the endpoint named by `conn_info.hostname`.
///
/// The connection is dropped instead of pooled when the global pool has been closed,
/// when the connection itself is closed, when the endpoint already holds
/// `max_conns_per_endpoint` connections, or when no `(db, user)` entry exists for
/// `conn_info`. Every path returns `Ok(())`.
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
let conn_id = client.conn_id;
// We want to hold this open while we return. This ensures that the pool can't close
// while we are in the middle of returning the connection.
// (the read guard is bound to `closed`, so it is released only when this function returns)
let closed = self.closed.read();
if *closed {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
return Ok(());
}
if client.inner.is_closed() {
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
return Ok(());
}
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
// return connection to the pool
let mut returned = false;
let mut per_db_size = 0;
// the endpoint write guard lives only inside this block expression
let total_conns = {
let mut pool = pool.write();
if pool.total_conns < self.max_conns_per_endpoint {
// we create this db-user entry in get, so it should not be None
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
pool_entries.conns.push(ConnPoolEntry {
conn: client,
_last_access: std::time::Instant::now(),
});
returned = true;
per_db_size = pool_entries.conns.len();
pool.total_conns += 1;
}
}
pool.total_conns
};
// do logging outside of the mutex
if returned {
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
} else {
// NOTE: this branch is also reached when the (db, user) entry was missing, not only when the pool is full
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
}
Ok(())
}
fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
@@ -446,12 +334,6 @@ impl GlobalConnPool {
let new_pool = Arc::new(RwLock::new(EndpointConnPool {
pools: HashMap::new(),
total_conns: 0,
max_conns: self
.proxy_config
.http_config
.pool_options
.max_conns_per_endpoint,
_guard: ENDPOINT_POOLS.guard(),
}));
// find or create a pool for this endpoint
@@ -481,11 +363,9 @@ impl GlobalConnPool {
}
struct TokioMechanism<'a> {
pool: Weak<RwLock<EndpointConnPool>>,
conn_info: &'a ConnInfo,
session_id: uuid::Uuid,
conn_id: uuid::Uuid,
idle: Duration,
}
#[async_trait]
@@ -505,8 +385,6 @@ impl ConnectMechanism for TokioMechanism<'_> {
timeout,
self.conn_id,
self.session_id,
self.pool.clone(),
self.idle,
)
.await
}
@@ -525,7 +403,6 @@ async fn connect_to_compute(
session_id: uuid::Uuid,
latency_timer: LatencyTimer,
peer_addr: IpAddr,
pool: Weak<RwLock<EndpointConnPool>>,
) -> anyhow::Result<ClientInner> {
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -570,8 +447,6 @@ async fn connect_to_compute(
conn_id,
conn_info,
session_id,
pool,
idle: config.http_config.pool_options.idle_timeout,
},
node_info,
&extra,
@@ -587,8 +462,6 @@ async fn connect_to_compute_once(
timeout: time::Duration,
conn_id: uuid::Uuid,
mut session: uuid::Uuid,
pool: Weak<RwLock<EndpointConnPool>>,
idle: Duration,
) -> Result<ClientInner, tokio_postgres::Error> {
let mut config = (*node_info.config).clone();
@@ -617,29 +490,13 @@ async fn connect_to_compute_once(
branch_id: node_info.aux.branch_id.clone(),
};
let db_user = conn_info.db_and_user();
tokio::spawn(
async move {
let _conn_gauge = conn_gauge;
let mut idle_timeout = pin!(tokio::time::sleep(idle));
poll_fn(move |cx| {
if matches!(rx.has_changed(), Ok(true)) {
session = *rx.borrow_and_update();
info!(%session, "changed session");
idle_timeout.as_mut().reset(Instant::now() + idle);
}
// 5 minute idle connection timeout
if idle_timeout.as_mut().poll(cx).is_ready() {
idle_timeout.as_mut().reset(Instant::now() + idle);
info!("connection idle");
if let Some(pool) = pool.clone().upgrade() {
// remove client from pool - should close the connection if it's idle.
// does nothing if the client is currently checked-out and in-use
if pool.write().remove_client(db_user.clone(), conn_id) {
info!("idle connection removed");
}
}
}
loop {
@@ -657,25 +514,15 @@ async fn connect_to_compute_once(
}
Some(Err(e)) => {
error!(%session, "connection error: {}", e);
break
return Poll::Ready(())
}
None => {
info!("connection closed");
break
return Poll::Ready(())
}
}
}
// remove from connection pool
if let Some(pool) = pool.clone().upgrade() {
if pool.write().remove_client(db_user.clone(), conn_id) {
info!("closed connection removed");
}
}
Poll::Ready(())
}).await;
}).await
}
.instrument(span)
);
@@ -705,27 +552,23 @@ pub struct Client {
conn_id: uuid::Uuid,
span: Span,
inner: Option<ClientInner>,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool>>,
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
}
pub struct Discard<'a> {
conn_id: uuid::Uuid,
conn_info: &'a ConnInfo,
pool: &'a mut Weak<RwLock<EndpointConnPool>>,
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
}
impl Client {
pub(self) async fn new(
inner: ClientInner,
conn_info: ConnInfo,
pool: Weak<RwLock<EndpointConnPool>>,
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
) -> Self {
Self {
conn_id: inner.conn_id,
inner: Some(inner),
span: Span::current(),
conn_info,
pool,
}
}
@@ -734,7 +577,6 @@ impl Client {
inner,
pool,
conn_id,
conn_info,
span: _,
} = self;
(
@@ -744,7 +586,6 @@ impl Client {
.inner,
Discard {
pool,
conn_info,
conn_id: *conn_id,
},
)
@@ -760,14 +601,14 @@ impl Client {
impl Discard<'_> {
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
let conn_info = &self.conn_info;
if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
if status != ReadyForQueryStatus::Idle {
if let Some((conn_info, _)) = self.pool.take() {
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
}
}
}
pub fn discard(&mut self) {
let conn_info = &self.conn_info;
if std::mem::take(self.pool).strong_count() > 0 {
if let Some((conn_info, _)) = self.pool.take() {
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
}
}
@@ -787,17 +628,16 @@ impl Deref for Client {
impl Drop for Client {
fn drop(&mut self) {
let conn_info = self.conn_info.clone();
let client = self
.inner
.take()
.expect("client inner should not be removed");
if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
if let Some((conn_info, conn_pool)) = self.pool.take() {
let current_span = self.span.clone();
// return connection to the pool
tokio::task::spawn_blocking(move || {
let _span = current_span.enter();
let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
let _ = conn_pool.put(&conn_info, client);
});
}
}

View File

@@ -206,7 +206,7 @@ pub async fn handle(
config: &'static HttpConfig,
) -> Result<Response<Body>, ApiError> {
let result = tokio::time::timeout(
config.request_timeout,
config.timeout,
handle_inner(
config,
request,
@@ -278,7 +278,7 @@ pub async fn handle(
Err(_) => {
let message = format!(
"HTTP-Connection timed out, execution time exeeded {} seconds",
config.request_timeout.as_secs()
config.timeout.as_secs()
);
error!(message);
json_response(
@@ -320,8 +320,7 @@ async fn handle_inner(
// Allow connection pooling only if explicitly requested
// or if we have decided that http pool is no longer opt-in
let allow_pool =
!config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
// isolation level, read only and deferrable
@@ -360,7 +359,7 @@ async fn handle_inner(
let payload: Payload = serde_json::from_slice(&body)?;
let mut client = conn_pool
.get(conn_info, !allow_pool, session_id, peer_addr)
.get(&conn_info, !allow_pool, session_id, peer_addr)
.await?;
let mut response = Response::builder()

View File

@@ -1,12 +1,9 @@
use std::collections::{HashMap, HashSet};
use std::collections::HashSet;
use anyhow::Context;
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver_api::shard::ShardIndex;
use tracing::{error, info, warn};
use utils::generation::Generation;
use utils::id::TimelineId;
use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
@@ -43,7 +40,7 @@ impl TimelineAnalysis {
pub(crate) fn branch_cleanup_and_check_errors(
id: &TenantShardTimelineId,
tenant_objects: &mut TenantObjectListing,
s3_root: &RootTarget,
s3_active_branch: Option<&BranchData>,
console_branch: Option<BranchData>,
s3_data: Option<S3TimelineBlobData>,
@@ -75,8 +72,8 @@ pub(crate) fn branch_cleanup_and_check_errors(
match s3_data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation: _index_part_generation,
s3_layers: _s3_layers,
index_part_generation,
mut s3_layers,
} => {
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
result.errors.push(format!(
@@ -114,19 +111,65 @@ pub(crate) fn branch_cleanup_and_check_errors(
))
}
if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
let layer_map_key = (layer, metadata.generation);
if !s3_layers.remove(&layer_map_key) {
// FIXME: this will emit false positives if an index was
// uploaded concurrently with our scan. To make this check
// correct, we need to try sending a HEAD request for the
// layer we think is missing.
result.errors.push(format!(
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
layer.file_name(),
metadata.generation.get_suffix(),
metadata.shard
"index_part.json contains a layer {}{} that is not present in remote storage",
layer_map_key.0.file_name(),
layer_map_key.1.get_suffix()
))
}
}
let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
.into_iter()
.filter(|(_layer_name, gen)|
// A layer is only considered orphaned if it has a generation below
// the index. If the generation is >= the index, then the layer may
// be an upload from a running pageserver, or even an upload from
// a new generation that didn't upload an index yet.
//
// Even so, a layer that is not referenced by the index could just
// be something enqueued for deletion, so while this check is valid
// for indicating that a layer is garbage, it is not an indicator
// of a problem.
gen < &index_part_generation)
.collect();
if !orphan_layers.is_empty() {
// An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
// these as a hint that there is something worth cleaning up here.
result.warnings.push(format!(
"index_part.json does not contain layers from S3: {:?}",
orphan_layers
.iter()
.map(|(layer_name, gen)| format!(
"{}{}",
layer_name.file_name(),
gen.get_suffix()
))
.collect::<Vec<_>>(),
));
result.garbage_keys.extend(orphan_layers.iter().map(
|(layer_name, layer_gen)| {
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
let delimiter = s3_root.delimiter();
if !key.ends_with(delimiter) {
key.push_str(delimiter);
}
key.push_str(&format!(
"{}{}",
&layer_name.file_name(),
layer_gen.get_suffix()
));
key
},
));
}
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
@@ -161,83 +204,6 @@ pub(crate) fn branch_cleanup_and_check_errors(
result
}
/// Reference count for one layer object seen in an S3 listing.
///
/// Starts at zero via `Default`. `TenantObjectListing::check_ref` increments it for each
/// index reference that resolves to the layer, and `TenantObjectListing::get_orphans`
/// reports the layers whose count is still zero.
#[derive(Default)]
pub(crate) struct LayerRef {
ref_count: usize,
}
/// Top-level index of objects in a tenant. This may be used by any shard-timeline within
/// the tenant to query whether an object exists.
#[derive(Default)]
pub(crate) struct TenantObjectListing {
// Outer key: (shard, timeline). Inner map: each listed (layer file name, generation)
// together with the number of index references found for it so far.
shard_timelines:
HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>,
}
impl TenantObjectListing {
/// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall
/// list of layer keys for the Tenant.
pub(crate) fn push(
&mut self,
ttid: TenantShardTimelineId,
layers: HashSet<(LayerFileName, Generation)>,
) {
let shard_index = ShardIndex::new(
ttid.tenant_shard_id.shard_number,
ttid.tenant_shard_id.shard_count,
);
let replaced = self.shard_timelines.insert(
(shard_index, ttid.timeline_id),
layers
.into_iter()
.map(|l| (l, LayerRef::default()))
.collect(),
);
assert!(
replaced.is_none(),
"Built from an S3 object listing, which should never repeat a key"
);
}
/// Having loaded a timeline index, check if a layer referenced by the index exists. If it does,
/// the layer's refcount will be incremented. Later, after calling this for all references in all indices
/// in a tenant, orphan layers may be detected by their zero refcounts.
///
/// Returns true if the layer exists
pub(crate) fn check_ref(
&mut self,
timeline_id: TimelineId,
layer_file: &LayerFileName,
metadata: &IndexLayerMetadata,
) -> bool {
let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
return false;
};
let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else {
return false;
};
layer_ref.ref_count += 1;
true
}
pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> {
let mut result = Vec::new();
for ((shard_index, timeline_id), layers) in &self.shard_timelines {
for ((layer_file, generation), layer_ref) in layers {
if layer_ref.ref_count == 0 {
result.push((*shard_index, *timeline_id, layer_file.clone(), *generation))
}
}
}
result
}
}
#[derive(Debug)]
pub(crate) struct S3TimelineBlobData {
pub(crate) blob_data: BlobDataParseResult,

View File

@@ -15,7 +15,6 @@ use anyhow::Context;
use aws_config::environment::EnvironmentVariableCredentialsProvider;
use aws_config::imds::credentials::ImdsCredentialsProvider;
use aws_config::meta::credentials::CredentialsProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::sso::SsoCredentialsProvider;
use aws_config::BehaviorVersion;
use aws_sdk_s3::config::Region;
@@ -256,11 +255,6 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
let chain = CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder().build(),
);
// Use SSO if we were given an account ID
@@ -271,7 +265,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
.account_id(sso_account)
.role_name("PowerUserAccess")
.start_url("https://neondb.awsapps.com/start")
.region(bucket_region.clone())
.region(Region::from_static("eu-central-1"))
.build(),
),
None => chain,

View File

@@ -2,25 +2,22 @@ use std::collections::{HashMap, HashSet};
use crate::checks::{
branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
TenantObjectListing, TimelineAnalysis,
TimelineAnalysis,
};
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
use aws_sdk_s3::Client;
use futures_util::{pin_mut, StreamExt, TryStreamExt};
use histogram::Histogram;
use pageserver::tenant::remote_timeline_client::remote_layer_path;
use pageserver::tenant::IndexPart;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use utils::id::TenantId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
with_orphans: HashSet<TenantShardTimelineId>,
with_garbage: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
layer_count: MinMaxHisto,
@@ -90,7 +87,7 @@ impl MetadataSummary {
count: 0,
with_errors: HashSet::new(),
with_warnings: HashSet::new(),
with_orphans: HashSet::new(),
with_garbage: HashSet::new(),
indices_by_version: HashMap::new(),
layer_count: MinMaxHisto::new(),
timeline_size_bytes: MinMaxHisto::new(),
@@ -144,10 +141,6 @@ impl MetadataSummary {
}
}
/// Records that the timeline shard `ttid` has at least one orphan layer.
/// `with_orphans` is a set, so repeated calls for the same `ttid` are stored once.
fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) {
self.with_orphans.insert(*ttid);
}
/// Long-form output for printing at end of a scan
pub fn summary_string(&self) -> String {
let version_summary: String = itertools::join(
@@ -161,7 +154,7 @@ impl MetadataSummary {
"Timelines: {0}
With errors: {1}
With warnings: {2}
With orphan layers: {3}
With garbage: {3}
Index versions: {version_summary}
Timeline size bytes: {4}
Layer size bytes: {5}
@@ -170,7 +163,7 @@ Timeline layer count: {6}
self.count,
self.with_errors.len(),
self.with_warnings.len(),
self.with_orphans.len(),
self.with_garbage.len(),
self.timeline_size_bytes.oneline(),
self.layer_size_bytes.oneline(),
self.layer_count.oneline(),
@@ -198,7 +191,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
// Generate a stream of TenantTimelineId
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
let timelines = timelines.try_buffered(CONCURRENCY);
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
let timelines = timelines.try_flatten();
// Generate a stream of S3TimelineBlobData
@@ -211,118 +204,17 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
Ok((ttid, data))
}
let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
let timelines = timelines.try_buffered(CONCURRENCY);
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
// We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
// shards in the same tenant might refer to one another's keys if a shard split has happened.
let mut tenant_id = None;
let mut tenant_objects = TenantObjectListing::default();
let mut tenant_timeline_results = Vec::new();
// Runs the per-timeline checks for every shard-timeline of one tenant, folding each result
// into `summary`, and afterwards reports the tenant's orphan layers.
fn analyze_tenant(
    tenant_id: TenantId,
    summary: &mut MetadataSummary,
    mut tenant_objects: TenantObjectListing,
    timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
) {
    // Generation of the index that was read for each timeline shard; consulted further
    // down when deciding whether an unreferenced layer really is an orphan.
    let mut index_generations = HashMap::new();

    for (ttid, data) in timelines {
        if let BlobDataParseResult::Parsed {
            index_part_generation,
            ..
        } = &data.blob_data
        {
            index_generations.insert(ttid, *index_part_generation);
        }

        // Checking this timeline shard's metadata also updates the tenant-wide layer
        // reference counts held in `tenant_objects`.
        let analysis =
            branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data));
        summary.update_analysis(&ttid, &analysis);
    }

    // Orphans can only be identified once all shards of the tenant have been processed,
    // because one shard's layers may be referenced from another shard's index.
    //
    // An orphan layer is neither corruption nor a sign of a problem: it merely occupies
    // space in remote storage and can be cleaned up whenever convenient.
    for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() {
        let tenant_shard_id = TenantShardId {
            tenant_id,
            shard_count: shard_index.shard_count,
            shard_number: shard_index.shard_number,
        };
        let ttid = TenantShardTimelineId {
            tenant_shard_id,
            timeline_id,
        };

        // A layer in the same or a later generation than the index we read is not an
        // orphan just because the index omits it: more likely the layer was uploaded
        // and the index that references it has not been written yet.
        let not_older_than_index = index_generations
            .get(&ttid)
            .map_or(false, |index_generation| &generation >= index_generation);
        if not_older_than_index {
            continue;
        }

        let orphan_path = remote_layer_path(
            &tenant_id,
            &timeline_id,
            shard_index,
            &layer_file,
            generation,
        );
        tracing::info!("Orphan layer detected: {orphan_path}");
        summary.notify_timeline_orphan(&ttid);
    }
}
// Iterate through all the timeline results. These are in key-order, so
// all results for the same tenant will be adjacent. We accumulate these,
// and then call `analyze_tenant` to flush, when we see the next tenant ID.
let mut summary = MetadataSummary::new();
pin_mut!(timelines);
while let Some(i) = timelines.next().await {
let (ttid, data) = i?;
summary.update_data(&data);
match tenant_id {
None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
Some(prev_tenant_id) => {
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
let tenant_objects = std::mem::take(&mut tenant_objects);
let timelines = std::mem::take(&mut tenant_timeline_results);
analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
}
}
}
let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
if let BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation: _index_part_generation,
s3_layers,
} = &data.blob_data
{
tenant_objects.push(ttid, s3_layers.clone());
}
tenant_timeline_results.push((ttid, data));
}
if !tenant_timeline_results.is_empty() {
analyze_tenant(
tenant_id.expect("Must be set if results are present"),
&mut summary,
tenant_objects,
tenant_timeline_results,
);
summary.update_analysis(&ttid, &analysis);
}
Ok(summary)

View File

@@ -1,4 +1,2 @@
result
*.json
hosts
poetry.lock

View File

@@ -1,11 +0,0 @@
[defaults]
host_key_checking = False
inventory=./hosts
remote_tmp=/tmp
remote_user=developer
callbacks_enabled = profile_tasks
[ssh_connection]
scp_if_ssh = True
ssh_args = -F ./ssh.cfg
pipelining = True

View File

@@ -1,16 +0,0 @@
[tool.poetry]
name = "sk-collect-dumps"
version = "0.1.0"
description = ""
authors = ["Arseny Sher <sher-ars@yandex.ru>"]
readme = "README.md"
packages = [{include = "sk_collect_dumps"}]
[tool.poetry.dependencies]
python = "^3.11"
ansible = "^9.1.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

View File

@@ -1,43 +1,25 @@
# Collect /v1/debug_dump from all safekeeper nodes
3. Issue an admin token (add/remove `.stage` from the URL for staging/prod and set the proper API key):
```
# staging:
AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# prod:
AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
# check
echo $AUTH_TOKEN
```
2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
There are two ways to do that, with ssm or tsh. ssm:
```
# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit):
AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml
```
This puts the results into the `result` directory *next to the playbook*.
## How to use ansible (staging)
tsh:
Update the inventory, if needed, selecting .build/.tech and optionally region:
```
rm -f hosts && echo '[safekeeper]' >> hosts
# staging:
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts
# prod:
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
```
Test ansible connection:
## How to use ansible (prod)
```
ansible all -m ping -v
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
```
Download the dumps:
```
mkdir -p result && rm -f result/*
ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml
```
3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.

View File

@@ -1,37 +1,18 @@
- name: Fetch state dumps from safekeepers
hosts: safekeeper
hosts: safekeepers
gather_facts: False
remote_user: "{{ remote_user }}"
tasks:
- name: Dump file
- name: Download file
get_url:
url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
dest: "/tmp/{{ inventory_hostname }}-dump.json"
headers:
Authorization: "Bearer {{ auth_token }}"
dest: "/tmp/{{ inventory_hostname }}.json"
- name: install rsync
ansible.builtin.apt:
name: rsync
update_cache: yes
become: yes
ignore_errors: true # it can be already installed and we don't always have sudo
- name: Fetch file from remote hosts (works only with ssm)
- name: Fetch file from remote hosts
fetch:
src: "/tmp/{{ inventory_hostname }}-dump.json"
dest: "./result/{{ inventory_hostname }}-dump.json"
src: "/tmp/{{ inventory_hostname }}.json"
dest: "./result/{{ inventory_hostname }}.json"
flat: yes
fail_on_missing: no
when: ansible_connection == "aws_ssm"
# xxx not sure how to make ansible 'synchronize' work with tsh
- name: Fetch file from remote hosts
shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json"
delegate_to: localhost
when: ansible_connection != "aws_ssm"
- name: remove remote dumps
ansible.builtin.file:
path: "/tmp/{{ inventory_hostname }}-dump.json"
state: absent

View File

@@ -1,13 +0,0 @@
# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh
# Common flags for all teleport.aws.neon.tech hosts
Host *
HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com
# Flags for all teleport.aws.neon.tech hosts except the proxy
Host * !teleport.aws.neon.tech
Port 3022
ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p
User developer
# End generated Teleport configuration

View File

@@ -31,22 +31,22 @@ SELECT
(data->>'tenant_id') AS tenant_id,
(data->>'timeline_id') AS timeline_id,
(data->'memory'->>'active')::bool AS active,
(data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn,
(data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn,
(data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn,
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn,
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn,
(data->'memory'->>'write_lsn')::pg_lsn AS write_lsn,
(data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
(data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
(data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
(data->'memory'->>'write_lsn')::bigint AS write_lsn,
(data->'memory'->>'num_computes')::bigint AS num_computes,
(data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn,
(data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
(data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
(data->'memory'->>'is_cancelled')::bool AS is_cancelled,
(data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn,
(data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn,
(data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
(data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
(data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
(data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn,
(data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn,
(data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn,
(data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn
(data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
(data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
(data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
(data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
FROM tmp_json
EOF

View File

@@ -16,7 +16,6 @@ class Metrics:
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
filter = filter or {}
res = []
for sample in self.metrics[name]:
try:
if all(sample.labels[k] == v for k, v in filter.items()):

View File

@@ -60,7 +60,7 @@ from fixtures.remote_storage import (
default_remote_storage,
remote_storage_to_toml_inline_table,
)
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
allure_add_grafana_links,
@@ -457,6 +457,7 @@ class NeonEnvBuilder:
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
self.enable_generations = True
self.scrub_on_exit = False
self.test_output_dir = test_output_dir
@@ -481,8 +482,6 @@ class NeonEnvBuilder:
self,
initial_tenant_conf: Optional[Dict[str, str]] = None,
default_remote_storage_if_missing: bool = True,
initial_tenant_shard_count: Optional[int] = None,
initial_tenant_shard_stripe_size: Optional[int] = None,
) -> NeonEnv:
"""
Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline.
@@ -500,11 +499,7 @@ class NeonEnvBuilder:
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
tenant_id=env.initial_tenant,
conf=initial_tenant_conf,
timeline_id=env.initial_timeline,
shard_count=initial_tenant_shard_count,
shard_stripe_size=initial_tenant_shard_stripe_size,
tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
)
assert env.initial_tenant == initial_tenant
assert env.initial_timeline == initial_timeline
@@ -682,7 +677,8 @@ class NeonEnvBuilder:
pageserver.stop(immediate=True)
self.env.attachment_service.stop(immediate=True)
if self.env.attachment_service is not None:
self.env.attachment_service.stop(immediate=True)
cleanup_error = None
@@ -776,9 +772,13 @@ class NeonEnv:
self.initial_tenant = config.initial_tenant
self.initial_timeline = config.initial_timeline
attachment_service_port = self.port_distributor.get_port()
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service: NeonAttachmentService = NeonAttachmentService(self)
if config.enable_generations:
attachment_service_port = self.port_distributor.get_port()
self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}"
self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self)
else:
self.control_plane_api = None
self.attachment_service = None
# Create a config file corresponding to the options
cfg: Dict[str, Any] = {
@@ -851,7 +851,8 @@ class NeonEnv:
# Start up broker, pageserver and all safekeepers
self.broker.try_start()
self.attachment_service.start()
if self.attachment_service is not None:
self.attachment_service.start()
for pageserver in self.pageservers:
pageserver.start()
@@ -1127,29 +1128,15 @@ class AbstractNeonCli(abc.ABC):
env_vars[var] = val
# Intercept CalledProcessError and print more info
try:
res = subprocess.run(
args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout,
)
except subprocess.TimeoutExpired as e:
if e.stderr:
stderr = e.stderr.decode(errors="replace")
else:
stderr = ""
if e.stdout:
stdout = e.stdout.decode(errors="replace")
else:
stdout = ""
log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
raise
res = subprocess.run(
args,
env=env_vars,
check=False,
universal_newlines=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
timeout=timeout,
)
indent = " "
if not res.returncode:
@@ -1200,8 +1187,6 @@ class NeonCli(AbstractNeonCli):
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
conf: Optional[Dict[str, str]] = None,
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
set_default: bool = False,
) -> Tuple[TenantId, TimelineId]:
"""
@@ -1229,12 +1214,6 @@ class NeonCli(AbstractNeonCli):
if set_default:
args.append("--set-default")
if shard_count is not None:
args.extend(["--shard-count", str(shard_count)])
if shard_stripe_size is not None:
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
res = self.raw_cli(args)
res.check_returncode()
return tenant_id, timeline_id
@@ -1555,19 +1534,6 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(args, check_return_code=True)
def tenant_migrate(
    self, tenant_shard_id: TenantShardId, new_pageserver: int, timeout_secs: Optional[int]
):
    """
    Invoke the CLI `tenant migrate` subcommand for `tenant_shard_id`, passing
    `new_pageserver` as the `--id` argument.

    The return code is checked, and `timeout_secs` is forwarded to `raw_cli`
    as its timeout.
    """
    return self.raw_cli(
        ["tenant", "migrate", "--tenant-id", str(tenant_shard_id), "--id", str(new_pageserver)],
        check_return_code=True,
        timeout=timeout_secs,
    )
def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]":
return self.raw_cli(["start"], check_return_code=check_return_code)
@@ -1663,66 +1629,6 @@ class NeonAttachmentService:
else:
return None
def node_register(self, node: NeonPageserver):
    """
    POST a registration for `node` to the control plane API's `/node` endpoint.

    The request carries the node id, the fixed host name "localhost" and the
    node's HTTP service port. Raises if the response status is an error.
    """
    body = dict(
        node_id=int(node.id),
        listen_http_addr="localhost",
        listen_http_port=node.service_port.http,
    )
    log.info(f"node_register({body})")
    response = requests.post(f"{self.env.control_plane_api}/node", json=body)
    response.raise_for_status()
def tenant_create(
    self,
    tenant_id: TenantId,
    shard_count: Optional[int] = None,
    shard_stripe_size: Optional[int] = None,
    tenant_config: Optional[Dict[Any, Any]] = None,
):
    """
    POST a tenant creation request to the control plane API's `/tenant` endpoint.

    `shard_parameters` is only sent when `shard_count` is given; `shard_stripe_size`
    is added to it when also given (and is ignored without `shard_count`).
    Entries of `tenant_config` are merged into the top level of the request body.
    Raises if the response status is an error.
    """
    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

    if shard_count is not None:
        shard_parameters: Dict[str, Any] = {"count": shard_count}
        if shard_stripe_size is not None:
            shard_parameters["stripe_size"] = shard_stripe_size
        body["shard_parameters"] = shard_parameters

    if tenant_config is not None:
        # Config keys share the top level of the body with the fields set above.
        body.update(tenant_config)

    response = requests.post(f"{self.env.control_plane_api}/tenant", json=body)
    response.raise_for_status()
    log.info(f"tenant_create success: {response.json()}")
def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
    """
    POST a timeline creation request for `tenant_id` to the control plane API.

    Raises if the response status is an error; logs the decoded response otherwise.
    """
    url = f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline"
    body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}

    response = requests.post(url, json=body)
    response.raise_for_status()
    log.info(f"tenant_timeline_create success: {response.json()}")
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
    """
    GET `/tenant/{tenant_id}/locate` from the control plane API and return the
    `shards` list from the JSON response.

    Raises if the response status is an error.
    """
    url = f"{self.env.control_plane_api}/tenant/{tenant_id}/locate"
    response = requests.get(url)
    response.raise_for_status()

    shards: list[dict[str, Any]] = response.json()["shards"]
    return shards
def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
    """
    PUT a shard split request for `tenant_id`, asking for `shard_count` shards,
    and return the `new_shards` list from the JSON response.

    Raises if the response status is an error.
    """
    url = f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split"
    response = requests.put(url, json={"new_shard_count": shard_count})
    response.raise_for_status()

    body = response.json()
    log.info(f"tenant_shard_split success: {body}")

    # NOTE(review): the elements are returned exactly as decoded from JSON; they are
    # not converted with TenantShardId.parse despite the annotation -- confirm what
    # callers expect.
    shards: list[TenantShardId] = body["new_shards"]
    return shards
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -1928,19 +1834,20 @@ class NeonPageserver(PgProtocol):
"""
client = self.http_client()
return client.tenant_attach(
tenant_id,
config,
config_null,
generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id),
tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
)
def tenant_detach(self, tenant_id: TenantId):
self.env.attachment_service.attach_hook_drop(tenant_id)
if self.env.attachment_service is not None:
self.env.attachment_service.attach_hook_drop(tenant_id)
client = self.http_client()
return client.tenant_detach(tenant_id)
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
# This API is only for use when generations are enabled
assert self.env.attachment_service is not None
if config["mode"].startswith("Attached") and "generation" not in config:
config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
@@ -1966,15 +1873,26 @@ class NeonPageserver(PgProtocol):
generation: Optional[int] = None,
) -> TenantId:
if generation is None:
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
generation = self.maybe_get_generation(tenant_id)
client = self.http_client(auth_token=auth_token)
return client.tenant_create(tenant_id, conf, generation=generation)
def tenant_load(self, tenant_id: TenantId):
client = self.http_client()
return client.tenant_load(
tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
)
return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
def maybe_get_generation(self, tenant_id: TenantId):
    """
    Issue a generation number for `tenant_id` on this pageserver, for tests that
    drive an HTTP client directly rather than going through the `tenant_attach`
    and `tenant_create` helpers.

    Returns None when the environment has no attachment service (legacy mode).
    """
    if self.env.attachment_service is None:
        return None
    return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
def append_pageserver_param_overrides(
@@ -3248,7 +3166,7 @@ def pytest_addoption(parser: Parser):
SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
)
@@ -3344,7 +3262,9 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
# pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
def check_restored_datadir_content(
test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, pageserver_id: Optional[int] = None
):
# Get the timeline ID. We need it for the 'basebackup' command
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
@@ -3365,7 +3285,6 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
cmd = rf"""
{psql_path} \
--no-psqlrc \
@@ -3434,27 +3353,6 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -
time.sleep(0.5)
def tenant_get_shards(
    env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None
) -> list[tuple[TenantShardId, NeonPageserver]]:
    """
    Helper for when you want to talk to one or more pageservers, and the
    caller _might_ have specified a pageserver, or they might leave it to
    us to figure out the shards for a tenant.

    If `pageserver_id` is given, that pageserver is returned for every shard
    instead of the one reported by the attachment service (or the default
    pageserver in the single-pageserver case).

    Caller should iterate over the response to apply their per-pageserver
    action to each shard.
    """
    # Resolve the caller's explicit choice once, so it can override the lookup below.
    override_pageserver = (
        env.get_pageserver(pageserver_id) if pageserver_id is not None else None
    )

    if len(env.pageservers) > 1:
        # Ask the attachment service which shards exist and where they live.
        return [
            (
                TenantShardId.parse(s["shard_id"]),
                override_pageserver or env.get_pageserver(s["node_id"]),
            )
            for s in env.attachment_service.locate(tenant_id)
        ]
    else:
        # Assume an unsharded tenant
        return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)]
def wait_for_last_flush_lsn(
env: NeonEnv,
endpoint: Endpoint,
@@ -3464,22 +3362,10 @@ def wait_for_last_flush_lsn(
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
shards = tenant_get_shards(env, tenant, pageserver_id)
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
results = []
for tenant_shard_id, pageserver in shards:
log.info(f"wait_for_last_flush_lsn: shard {tenant_shard_id}")
waited = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
assert waited >= last_flush_lsn
results.append(waited)
# Return the lowest LSN that has been ingested by all shards
return min(results)
return wait_for_last_record_lsn(
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
)
def wait_for_wal_insert_lsn(
@@ -3491,16 +3377,9 @@ def wait_for_wal_insert_lsn(
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
result = None
for tenant_shard_id, pageserver in tenant_get_shards(env, tenant, pageserver_id):
shard_r = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
if result is None:
result = shard_r
assert result is not None
return result
return wait_for_last_record_lsn(
env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn
)
def fork_at_current_lsn(
@@ -3534,13 +3413,11 @@ def last_flush_lsn_upload(
last_flush_lsn = wait_for_last_flush_lsn(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id
)
shards = tenant_get_shards(env, tenant_id, pageserver_id)
for tenant_shard_id, pageserver in shards:
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, timeline_id)
wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn)
ps_http = env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
return last_flush_lsn

View File

@@ -13,7 +13,7 @@ from urllib3.util.retry import Retry
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import Fn
@@ -433,7 +433,7 @@ class PageserverHttpClient(requests.Session):
def timeline_detail(
self,
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
include_non_incremental_logical_size: bool = False,
include_timeline_dir_layer_file_size_sum: bool = False,
@@ -455,7 +455,7 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def timeline_delete(self, tenant_id: TenantShardId, timeline_id: TimelineId, **kwargs):
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
"""
Note that deletion is not instant, it is scheduled and performed mostly in the background.
So if you need to wait for it to complete use `timeline_delete_wait_completed`.
@@ -469,7 +469,7 @@ class PageserverHttpClient(requests.Session):
assert res_json is None
def timeline_gc(
self, tenant_id: TenantShardId, timeline_id: TimelineId, gc_horizon: Optional[int]
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
) -> dict[str, Any]:
"""
Unlike most handlers, this will wait for the layers to be actually
@@ -540,7 +540,7 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_checkpoint(
self, tenant_id: TenantShardId, timeline_id: TimelineId, force_repartition=False
self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False
):
self.is_testing_enabled_or_skip()
query = {}
@@ -682,34 +682,6 @@ class PageserverHttpClient(requests.Session):
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
    self, names: list[str], filter: Optional[Dict[str, str]] = None
) -> Dict[str, float]:
    """
    Fetch several named metrics with a single scrape; cheaper than calling
    `get_metric_value` once per name.

    Returns a mapping from metric name to value.

    Throws RuntimeError if any name matches more than one sample, or if not
    every entry of `names` was found: this method is meant for sets of metrics
    whose existence is coupled.
    """
    metrics = self.get_metrics()

    # Gather every matching sample first, then fold them into a name -> value map.
    found = [sample for name in names for sample in metrics.query_all(name, filter=filter)]

    values: Dict[str, float] = {}
    for sample in found:
        if sample.name in values:
            raise RuntimeError(f"Multiple values found for {sample.name}")
        values[sample.name] = sample.value

    if len(values) == len(names):
        return values

    log.info(f"Metrics found: {metrics.metrics}")
    raise RuntimeError(f"could not find all metrics {' '.join(names)}")
def layer_map_info(
self,
tenant_id: TenantId,

View File

@@ -6,7 +6,7 @@ from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.remote_storage import RemoteStorageKind, S3Storage
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -22,7 +22,7 @@ def assert_tenant_state(
def remote_consistent_lsn(
pageserver_http: PageserverHttpClient, tenant: TenantShardId, timeline: TimelineId
pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
detail = pageserver_http.timeline_detail(tenant, timeline)
@@ -39,7 +39,7 @@ def remote_consistent_lsn(
def wait_for_upload(
pageserver_http: PageserverHttpClient,
tenant: TenantShardId,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
):
@@ -92,7 +92,7 @@ def wait_until_tenant_state(
def wait_until_timeline_state(
pageserver_http: PageserverHttpClient,
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
expected_state: str,
iterations: int,
@@ -141,7 +141,7 @@ def wait_until_tenant_active(
def last_record_lsn(
pageserver_http_client: PageserverHttpClient, tenant: TenantShardId, timeline: TimelineId
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
@@ -152,7 +152,7 @@ def last_record_lsn(
def wait_for_last_record_lsn(
pageserver_http: PageserverHttpClient,
tenant: TenantShardId,
tenant: TenantId,
timeline: TimelineId,
lsn: Lsn,
) -> Lsn:
@@ -194,7 +194,7 @@ def wait_for_upload_queue_empty(
def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
iterations: int,
interval: Optional[float] = None,
@@ -219,7 +219,7 @@ def wait_timeline_detail_404(
def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
iterations: int = 20,
interval: Optional[float] = None,

View File

@@ -125,51 +125,3 @@ class TenantId(Id):
class TimelineId(Id):
def __repr__(self) -> str:
return f'TimelineId("{self.id.hex()}")'
# Workaround for compat with python 3.9, which does not have `typing.Self`
TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
class TenantShardId:
    """
    Identifies one shard of a tenant: a tenant id plus a shard number and shard count.

    A shard count of 0 is what `parse` produces for a bare 32-character tenant id.
    Instances are ordered, compared and hashed by the
    (tenant_id, shard_number, shard_count) tuple.
    """

    def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int):
        self.tenant_id = tenant_id
        self.shard_number = shard_number
        self.shard_count = shard_count
        assert self.shard_number < self.shard_count or self.shard_count == 0

    @classmethod
    def parse(cls: Type[TTenantShardId], input) -> TTenantShardId:
        """
        Build an instance from its string form.

        Accepts a 32-character tenant id (shard number and count both 0), or a
        37-character form: the tenant id, one separator character, then two hex
        digits each for shard number and shard count. Any other length raises
        ValueError.
        """
        length = len(input)
        if length not in (32, 37):
            raise ValueError(f"Invalid TenantShardId '{input}'")

        if length == 32:
            return cls(tenant_id=TenantId(input), shard_number=0, shard_count=0)

        # Character 32 is the separator and is skipped, not inspected.
        return cls(
            tenant_id=TenantId(input[0:32]),
            shard_number=int(input[33:35], 16),
            shard_count=int(input[35:37], 16),
        )

    def __str__(self):
        return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"

    def _tuple(self) -> tuple[TenantId, int, int]:
        # Single source of truth for ordering, equality and hashing.
        return (self.tenant_id, self.shard_number, self.shard_count)

    def __lt__(self, other) -> bool:
        if isinstance(other, type(self)):
            return self._tuple() < other._tuple()
        return NotImplemented

    def __eq__(self, other) -> bool:
        if isinstance(other, type(self)):
            return self._tuple() == other._tuple()
        return NotImplemented

    def __hash__(self) -> int:
        return hash(self._tuple())

View File

@@ -5,7 +5,6 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
last_flush_lsn_upload,
tenant_get_shards,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
@@ -32,7 +31,7 @@ class Workload:
self._endpoint: Optional[Endpoint] = None
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
def endpoint(self, pageserver_id: int) -> Endpoint:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
"main",
@@ -55,7 +54,7 @@ class Workload:
if self._endpoint is not None:
self._endpoint.stop()
def init(self, pageserver_id: Optional[int] = None):
def init(self, pageserver_id: int):
endpoint = self.endpoint(pageserver_id)
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
@@ -64,7 +63,7 @@ class Workload:
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def write_rows(self, n, pageserver_id: Optional[int] = None):
def write_rows(self, n, pageserver_id):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
@@ -82,7 +81,7 @@ class Workload:
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
def churn_rows(self, n, pageserver_id, upload=True):
assert self.expect_rows >= n
max_iters = 10
@@ -120,24 +119,21 @@ class Workload:
]
)
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = self.env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
def validate(self, pageserver_id: Optional[int] = None):
def validate(self, pageserver_id):
endpoint = self.endpoint(pageserver_id)
result = endpoint.safe_psql_many(
[

View File

@@ -61,6 +61,7 @@ def measure_recovery_time(env: NeonCompare):
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
# we will explicitly create the tenant in the same generation that it was previously
# attached in.
assert env.env.attachment_service is not None
attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
assert attach_status is not None
(attach_gen, _) = attach_status

View File

@@ -151,9 +151,7 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale:
An OLAP-style ClickHouse benchmark
Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
The DB prepared manually in advance.
Important: after initial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;`
to ensure that Postgres optimizer chooses the same plans as RDS and Aurora.
The DB prepared manually in advance
"""
explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true"

View File

@@ -136,7 +136,10 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
ps_http.tenant_detach(tenant_id)
assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}
body = {}
gen = env.pageserver.maybe_get_generation(tenant_id)
if gen is not None:
body["generation"] = gen
ps_http.post(
f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",

View File

@@ -87,6 +87,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
#
# Since we're dual-attached, need to tip-off attachment service to treat the one we're
# about to start as the attached pageserver
assert env.attachment_service is not None
env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
env.pageservers[0].start()
env.pageservers[1].stop()

View File

@@ -1,7 +1,6 @@
import enum
import time
from dataclasses import dataclass
from typing import Any, Dict, Tuple
from typing import Dict, Tuple
import pytest
import toml
@@ -65,23 +64,6 @@ def test_min_resident_size_override_handling(
assert_config(tenant_id, None, config_level_override)
@enum.unique
class EvictionOrder(str, enum.Enum):
ABSOLUTE_ORDER = "absolute"
RELATIVE_ORDER_EQUAL = "relative_equal"
RELATIVE_ORDER_SPARE = "relative_spare"
def config(self) -> Dict[str, Any]:
if self == EvictionOrder.ABSOLUTE_ORDER:
return {"type": "AbsoluteAccessed"}
elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}}
elif self == EvictionOrder.RELATIVE_ORDER_SPARE:
return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}}
else:
raise RuntimeError(f"not implemented: {self}")
@dataclass
class EvictionEnv:
timelines: list[Tuple[TenantId, TimelineId]]
@@ -126,14 +108,13 @@ class EvictionEnv:
_avg = cur.fetchone()
def pageserver_start_with_disk_usage_eviction(
self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder
self, period, max_usage_pct, min_avail_bytes, mock_behavior
):
disk_usage_config = {
"period": period,
"max_usage_pct": max_usage_pct,
"min_avail_bytes": min_avail_bytes,
"mock_statvfs": mock_behavior,
"eviction_order": eviction_order.config(),
}
enc = toml.TomlEncoder()
@@ -289,13 +270,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
@pytest.mark.parametrize(
"order",
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
)
def test_pageserver_evicts_until_pressure_is_relieved(
eviction_env: EvictionEnv, order: EvictionOrder
):
def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
"""
Basic test to ensure that we evict enough to relieve pressure.
"""
@@ -306,9 +281,7 @@ def test_pageserver_evicts_until_pressure_is_relieved(
target = total_on_disk // 2
response = pageserver_http.disk_usage_eviction_run(
{"evict_bytes": target, "eviction_order": order.config()}
)
response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
(later_total_on_disk, _, _) = env.timelines_du()
@@ -323,13 +296,7 @@ def test_pageserver_evicts_until_pressure_is_relieved(
assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
@pytest.mark.parametrize(
"order",
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
)
def test_pageserver_respects_overridden_resident_size(
eviction_env: EvictionEnv, order: EvictionOrder
):
def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv):
"""
Override tenant min resident and ensure that it will be respected by eviction.
"""
@@ -369,9 +336,7 @@ def test_pageserver_respects_overridden_resident_size(
env.warm_up_tenant(large_tenant[0])
# do one run
response = ps_http.disk_usage_eviction_run(
{"evict_bytes": target, "eviction_order": order.config()}
)
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
time.sleep(1) # give log time to flush
@@ -400,11 +365,7 @@ def test_pageserver_respects_overridden_resident_size(
assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target
@pytest.mark.parametrize(
"order",
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
)
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder):
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
"""
If we can't relieve pressure using tenant_min_resident_size-respecting eviction,
we should continue to evict layers following global LRU.
@@ -415,9 +376,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
(total_on_disk, _, _) = env.timelines_du()
target = total_on_disk
response = ps_http.disk_usage_eviction_run(
{"evict_bytes": target, "eviction_order": order.config()}
)
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
(later_total_on_disk, _, _) = env.timelines_du()
@@ -430,15 +389,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
@pytest.mark.parametrize(
"order",
[
EvictionOrder.ABSOLUTE_ORDER,
EvictionOrder.RELATIVE_ORDER_EQUAL,
EvictionOrder.RELATIVE_ORDER_SPARE,
],
)
def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
def test_partial_evict_tenant(eviction_env: EvictionEnv):
"""
Warm up a tenant, then build up pressure to cause evictions in both.
We expect
@@ -451,7 +402,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
(total_on_disk, _, _) = env.timelines_du()
du_by_timeline = env.du_by_timeline()
# pick smaller or greater (iteration order is insertion order of scale=4 and scale=6)
# pick any tenant
[warm, cold] = list(du_by_timeline.keys())
(tenant_id, timeline_id) = warm
@@ -462,9 +413,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
# but not enough to fall into global LRU.
# So, set target to all occupied space, except 2*env.layer_size per tenant
target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
response = ps_http.disk_usage_eviction_run(
{"evict_bytes": target, "eviction_order": order.config()}
)
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
(later_total_on_disk, _, _) = env.timelines_du()
@@ -479,32 +428,28 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
), "all tenants should have lost some layers"
warm_size = later_du_by_timeline[warm]
# bounds for warmed_size
warm_lower = 0.5 * du_by_timeline[warm]
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
# So, check for up to 3 here.
warm_upper = warm_lower + 3 * env.layer_size
cold_size = later_du_by_timeline[cold]
cold_upper = 2 * env.layer_size
if order == EvictionOrder.ABSOLUTE_ORDER:
# bounds for warmed_size
warm_lower = 0.5 * du_by_timeline[warm]
log.info(
f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
)
log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
# So, check for up to 3 here.
warm_upper = warm_lower + 3 * env.layer_size
assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
cold_upper = 2 * env.layer_size
log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
log.info(
f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
)
log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
assert (
cold_size < cold_upper
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
else:
# just go with the space that was freed; find proper limits later
pass
assert (
cold_size < cold_upper
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
def poor_mans_du(
@@ -556,7 +501,6 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
"type": "Failure",
"mocked_error": "EIO",
},
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
@@ -589,7 +533,6 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
# This avoids accounting for metadata files & tenant conf in the tests.
"name_filter": ".*__.*",
},
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
def relieved_log_message():
@@ -630,7 +573,6 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
# This avoids accounting for metadata files & tenant conf in the tests.
"name_filter": ".*__.*",
},
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
def relieved_log_message():

View File

@@ -157,6 +157,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites
def get_generation_number():
assert env.attachment_service is not None
attachment = env.attachment_service.inspect(tenant_id)
assert attachment is not None
return attachment[0]

View File

@@ -72,9 +72,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient):
# create new tenant and check it is also there
tenant_id = TenantId.generate()
client.tenant_create(
tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
)
client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id))
assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}
timelines = client.timeline_list(tenant_id)

View File

@@ -187,6 +187,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
- After upgrade, the bucket should contain a mixture.
- In both cases, postgres I/O should work.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
@@ -195,6 +196,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
env.broker.try_start()
for sk in env.safekeepers:
sk.start()
assert env.attachment_service is not None
env.attachment_service.start()
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
@@ -260,10 +262,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
some_other_pageserver = 1234
ps_http = env.pageserver.http_client()
@@ -337,6 +341,7 @@ def test_deletion_queue_recovery(
:param validate_before: whether to wait for deletions to be validated before restart. This
makes them eligible to be executed after restart, if the same node keeps the attachment.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
@@ -400,6 +405,7 @@ def test_deletion_queue_recovery(
if keep_attachment == KeepAttachment.LOSE:
some_other_pageserver = 101010
assert env.attachment_service is not None
env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
env.pageserver.start()
@@ -447,6 +453,7 @@ def test_deletion_queue_recovery(
def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
@@ -466,6 +473,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
)
# Simulate a major incident: the control plane goes offline
assert env.attachment_service is not None
env.attachment_service.stop()
# Remember how many validations had happened before the control plane went offline
@@ -537,6 +545,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
and must be constructed using the proper generation for the layer, which may not be the same generation
that the tenant is running in.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
@@ -566,6 +575,7 @@ def test_multi_attach(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 3
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,

View File

@@ -1,6 +1,4 @@
import random
from contextlib import closing
from typing import Optional
import pytest
from fixtures.log_helper import log
@@ -11,7 +9,9 @@ from fixtures.utils import wait_until
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("generations", [True, False])
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool):
neon_env_builder.enable_generations = generations
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
@@ -143,24 +143,18 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.
@pytest.mark.timeout(540)
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pageserver_chaos(
neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
):
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
env = neon_env_builder.init_start()
# these can happen, if we shutdown at a good time. to be fixed as part of #5172.
message = ".*duplicated L1 layer layer=.*"
for ps in env.pageservers:
ps.allowed_errors.append(message)
env.pageserver.allowed_errors.append(message)
# Use a tiny checkpoint distance, to create a lot of layers quickly.
# That allows us to stress the compaction and layer flushing logic more.
@@ -200,19 +194,13 @@ def test_pageserver_chaos(
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
# We run "random" kills using a fixed seed, to improve reproducibility if a test
# failure is related to a particular order of operations.
seed = 0xDEADBEEF
rng = random.Random(seed)
# Update the whole table, then immediately kill and restart the pageserver
for i in range(1, 15):
endpoint.safe_psql("UPDATE foo set updates = updates + 1")
# This kills the pageserver immediately, to simulate a crash
to_kill = rng.choice(env.pageservers)
to_kill.stop(immediate=True)
to_kill.start()
env.pageserver.stop(immediate=True)
env.pageserver.start()
# Check that all the updates are visible
num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]

View File

@@ -57,11 +57,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
states are valid, so that we may test it in this way: the API should always
work as long as the tenant exists.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 3
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
pageservers = env.pageservers
list([p.http_client() for p in pageservers])
@@ -208,11 +210,13 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
"""
Test the sequence of location states that are used in a live migration.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

View File

@@ -2,40 +2,25 @@
# This file runs pg_regress-based tests.
#
from pathlib import Path
from typing import Optional
import pytest
from fixtures.neon_fixtures import (
NeonEnvBuilder,
check_restored_datadir_content,
)
from fixtures.remote_storage import s3_storage
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
# Run the main PostgreSQL regression tests, in src/test/regress.
#
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pg_regress(
neon_env_builder: NeonEnvBuilder,
neon_simple_env: NeonEnv,
test_output_dir: Path,
pg_bin,
capsys,
base_dir: Path,
pg_distrib_dir: Path,
shard_count: Optional[int],
):
"""
:param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this
many shards.
"""
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
env = neon_simple_env
env.neon_cli.create_branch("test_pg_regress", "empty")
# Connect to postgres and create a database called "regression".
endpoint = env.endpoints.create_start("main")
endpoint = env.endpoints.create_start("test_pg_regress")
endpoint.safe_psql("CREATE DATABASE regression")
# Create some local directories for pg_regress to run in.
@@ -76,25 +61,22 @@ def test_pg_regress(
# Run the PostgreSQL "isolation" tests, in src/test/isolation.
#
@pytest.mark.parametrize("shard_count", [None, 4])
def test_isolation(
neon_env_builder: NeonEnvBuilder,
neon_simple_env: NeonEnv,
test_output_dir: Path,
pg_bin,
capsys,
base_dir: Path,
pg_distrib_dir: Path,
shard_count: Optional[int],
):
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
env = neon_simple_env
env.neon_cli.create_branch("test_isolation", "empty")
# Connect to postgres and create a database called "isolation_regression".
# isolation tests use prepared transactions, so enable them
endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
endpoint = env.endpoints.create_start(
"test_isolation", config_lines=["max_prepared_transactions=100"]
)
endpoint.safe_psql("CREATE DATABASE isolation_regression")
# Create some local directories for pg_isolation_regress to run in.
@@ -132,24 +114,19 @@ def test_isolation(
# Run extra Neon-specific pg_regress-based tests. The tests and their
# schedule file are in the sql_regress/ directory.
@pytest.mark.parametrize("shard_count", [None, 4])
def test_sql_regress(
neon_env_builder: NeonEnvBuilder,
neon_simple_env: NeonEnv,
test_output_dir: Path,
pg_bin,
capsys,
base_dir: Path,
pg_distrib_dir: Path,
shard_count: Optional[int],
):
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
env = neon_simple_env
env.neon_cli.create_branch("test_sql_regress", "empty")
# Connect to postgres and create a database called "regression".
endpoint = env.endpoints.create_start("main")
endpoint = env.endpoints.create_start("test_sql_regress")
endpoint.safe_psql("CREATE DATABASE regression")
# Create some local directories for pg_regress to run in.

View File

@@ -60,6 +60,8 @@ def test_remote_storage_backup_and_restore(
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
neon_env_builder.enable_generations = generations
# Exercise retry code path by making all uploads and downloads fail for the
# first time. The retries print INFO-messages to the log; we will check
# that they are present after the test.

View File

@@ -1,144 +0,0 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
tenant_get_shards,
)
from fixtures.remote_storage import s3_storage
from fixtures.workload import Workload
def test_sharding_smoke(
neon_env_builder: NeonEnvBuilder,
):
"""
Test the basic lifecycle of a sharded tenant:
- ingested data gets split up
- page service reads
- timeline creation and deletion
- splits
"""
# We will start with 4 shards and split into 8, then migrate all those
# 8 shards onto separate pageservers
shard_count = 4
split_shard_count = 8
neon_env_builder.num_pageservers = split_shard_count
# 1MiB stripes: enable getting some meaningful data distribution without
# writing large quantities of data in this test.
stripe_size = 128
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
neon_env_builder.preserve_database_files = True
env = neon_env_builder.init_start(
initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
for ps in env.pageservers:
ps.allowed_errors.extend(
[
# FIXME: during a split, control plane should respond affirmatively to validation requests
# that refer to a shard that no longer exists, but has a child shard.
".*Dropped remote consistent LSN updates.*",
# FIXME: improve logging in the pageserver so that this isn't considered an error, or
# figure out how to make the migration even more seamless.
".*Tenant.*is not active.*",
]
)
# TODO: do some timeline creations & deletions on the sharded tenant
# TODO: validate that timeline APIs show the created timelines on all shards
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(1000)
# Note which pageservers initially hold a shard after tenant creation
pre_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
# For pageservers holding a shard, validate their ingest statistics
# reflect a proper splitting of the WAL.
for pageserver in env.pageservers:
if pageserver.id not in pre_split_pageserver_ids:
continue
metrics = pageserver.http_client().get_metrics_values(
[
"pageserver_wal_ingest_records_received_total",
"pageserver_wal_ingest_records_committed_total",
"pageserver_wal_ingest_records_filtered_total",
]
)
log.info(f"Pageserver {pageserver.id} metrics: {metrics}")
# Not everything received was committed
assert (
metrics["pageserver_wal_ingest_records_received_total"]
> metrics["pageserver_wal_ingest_records_committed_total"]
)
# Something was committed
assert metrics["pageserver_wal_ingest_records_committed_total"] > 0
# Counts are self consistent
assert (
metrics["pageserver_wal_ingest_records_received_total"]
== metrics["pageserver_wal_ingest_records_committed_total"]
+ metrics["pageserver_wal_ingest_records_filtered_total"]
)
# TODO: validate that shards have different sizes
workload.validate()
assert len(pre_split_pageserver_ids) == 4
env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
post_split_pageserver_ids = [loc["node_id"] for loc in env.attachment_service.locate(tenant_id)]
# We should have split into 8 shards, on the same 4 pageservers we started on.
assert len(post_split_pageserver_ids) == split_shard_count
assert len(set(post_split_pageserver_ids)) == shard_count
assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids)
workload.validate()
workload.churn_rows(1000)
workload.validate()
# Run GC on all new shards, to check they don't barf or delete anything that breaks reads
# (compaction was already run as part of churn_rows)
all_shards = tenant_get_shards(env, tenant_id)
for tenant_shard_id, pageserver in all_shards:
pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None)
# Restart all nodes, to check that the newly created shards are durable
for ps in env.pageservers:
ps.restart()
workload.validate()
migrate_to_pageserver_ids = list(
set(p.id for p in env.pageservers) - set(pre_split_pageserver_ids)
)
assert len(migrate_to_pageserver_ids) == split_shard_count - shard_count
# Migrate shards away from the node where the split happened
for ps_id in pre_split_pageserver_ids:
shards_here = [
tenant_shard_id
for (tenant_shard_id, pageserver) in all_shards
if pageserver.id == ps_id
]
assert len(shards_here) == 2
migrate_shard = shards_here[0]
destination = migrate_to_pageserver_ids.pop()
log.info(f"Migrating shard {migrate_shard} from {ps_id} to {destination}")
env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)

View File

@@ -263,6 +263,15 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
ps_http, env.initial_tenant, timeline_id, iterations=iterations
)
if failpoint == "timeline-delete-after-index-delete":
m = ps_http.get_metrics()
assert (
m.query_one(
"remote_storage_s3_request_seconds_count",
filter={"request_type": "get_object", "result": "ok"},
).value
== 1 # index part for initial timeline
)
elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state