Compare commits

..

6 Commits

Author SHA1 Message Date
Arpad Müller
80f68a0029 Use bounds() function 2024-07-02 14:57:20 +02:00
Arpad Müller
9a134a8f18 Fix tests 2024-07-02 12:11:46 +02:00
Arpad Müller
9290f57750 Return the entire buffer in BlobWriter as well 2024-07-02 01:28:27 +02:00
Arpad Müller
6202c84408 Actually return the same slice 2024-07-02 01:17:04 +02:00
Arpad Müller
85260b4905 Fix tests as well 2024-07-01 19:30:07 +02:00
Arpad Müller
bdedd2192b Use Slice<_> in write path instead of B: BoundedBuf<...> 2024-07-01 18:52:10 +02:00
171 changed files with 2817 additions and 6496 deletions

View File

@@ -114,8 +114,6 @@ runs:
export PLATFORM=${PLATFORM:-github-actions-selfhosted}
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
export DEFAULT_PG_VERSION=${PG_VERSION#v}
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1
@@ -180,15 +178,7 @@ runs:
# Wake up the cluster if we use remote neon instance
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
QUERIES=("SELECT version()")
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
fi
for q in "${QUERIES[@]}"; do
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
done
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
fi
# Run the tests.

View File

@@ -99,14 +99,7 @@ jobs:
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests are running in order of appears in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
extra_params:
-m remote_cluster
--sparse-ordering
--timeout 5400
--ignore test_runner/performance/test_perf_olap.py
--ignore test_runner/performance/test_perf_pgvector_queries.py
--ignore test_runner/performance/test_logical_replication.py
--ignore test_runner/performance/test_physical_replication.py
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -132,69 +125,6 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
replication-tests:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_logical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_physical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
@@ -309,6 +239,11 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
id: create-neon-project
@@ -347,6 +282,16 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
with:
@@ -432,13 +377,26 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Benchmark pgvector hnsw indexing
uses: ./.github/actions/run-python-test-set
with:
@@ -459,12 +417,12 @@ jobs:
test_selection: performance/test_perf_pgvector_queries.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600
extra_params: -m remote_cluster --timeout 21600
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -519,6 +477,11 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
@@ -540,6 +503,16 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
with:
@@ -607,6 +580,11 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Get Connstring Secret Name
run: |
case "${PLATFORM}" in
@@ -635,6 +613,16 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
with:
@@ -693,6 +681,11 @@ jobs:
path: /tmp/neon/
prefix: latest
- name: Add Postgres binaries to PATH
run: |
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
- name: Set up Connection String
id: set-up-connstr
run: |
@@ -714,6 +707,16 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
- name: Run user examples
uses: ./.github/actions/run-python-test-set
with:

View File

@@ -63,16 +63,14 @@ jobs:
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v3
- uses: docker/login-action@v2
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/build-push-action@v6
- uses: docker/build-push-action@v4
with:
context: .
provenance: false
@@ -84,7 +82,6 @@ jobs:
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf /tmp/.docker-custom

View File

@@ -30,7 +30,7 @@ jobs:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}
github-event-name: ${{ github.event_name}}
cancel-previous-e2e-tests:
needs: [ check-permissions ]
@@ -335,8 +335,6 @@ jobs:
- name: Run cargo build
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
# Do install *before* running rust tests because they might recompile the
@@ -385,11 +383,6 @@ jobs:
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
@@ -751,16 +744,14 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- uses: docker/build-push-action@v6
- uses: docker/build-push-action@v5
with:
context: .
build-args: |
@@ -831,12 +822,11 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
- uses: docker/setup-buildx-action@v2
with:
cache-binary: false
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
buildkitd-config-inline: |
config-inline: |
[worker.oci]
max-parallelism = 1
@@ -852,7 +842,7 @@ jobs:
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
- name: Build compute-node image
uses: docker/build-push-action@v6
uses: docker/build-push-action@v5
with:
context: .
build-args: |
@@ -871,7 +861,7 @@ jobs:
- name: Build neon extensions test image
if: matrix.version == 'v16'
uses: docker/build-push-action@v6
uses: docker/build-push-action@v5
with:
context: .
build-args: |
@@ -892,7 +882,7 @@ jobs:
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v16'
uses: docker/build-push-action@v6
uses: docker/build-push-action@v5
with:
target: compute-tools-image
context: .
@@ -1336,7 +1326,6 @@ jobs:
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
# Update compatibility snapshot for the release
for pg_version in v14 v15 v16; do
@@ -1350,7 +1339,7 @@ jobs:
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
@@ -1369,31 +1358,3 @@ jobs:
with:
from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }}
secrets: inherit
# This job simplifies setting branch protection rules (in GitHub UI)
# by allowing to set only this job instead of listing many others.
# It also makes it easier to rename or parametrise jobs (using matrix)
# which requires changes in branch protection rules
#
# Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that.
#
# https://github.com/neondatabase/neon/settings/branch_protection_rules
conclusion:
if: always()
# Format `needs` differently to make the list more readable.
# Usually we do `needs: [...]`
needs:
- check-codestyle-python
- check-codestyle-rust
- regress-tests
- test-images
runs-on: ubuntu-22.04
steps:
# The list of possible results:
# https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
- name: Fail the job if any of the dependencies do not succeed
run: exit 1
if: |
contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')
|| contains(needs.*.result, 'skipped')

View File

@@ -232,19 +232,12 @@ jobs:
- name: Run cargo build
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH
cargo nextest run $CARGO_FEATURES -j$(nproc)
# Run separate tests for real S3
@@ -385,7 +378,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)
run: cargo build --all --release --timings -j$(nproc)
- name: Upload the build stats
id: upload-stats

View File

@@ -1,155 +0,0 @@
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
workflow_dispatch: # Allows manual triggering of the workflow
inputs:
commit_hash:
type: string
description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
required: false
default: ''
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false
jobs:
trigger_bench_on_ec2_machine_in_eu_central_1:
runs-on: [ self-hosted, gen3, small ]
container:
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
timeout-minutes: 360 # Set the timeout to 6 hours
env:
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
RUN_ID: ${{ github.run_id }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
AWS_DEFAULT_REGION : "eu-central-1"
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
steps:
# we don't need the neon source code because we run everything remotely
# however we still need the local github actions to run the allure step below
- uses: actions/checkout@v4
- name: Show my own (github runner) external IP address - usefull for IP allowlisting
run: curl https://ifconfig.me
- name: Start EC2 instance and wait for the instance to boot up
run: |
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
sleep 60 # sleep some time to allow cloudinit and our API server to start up
- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
run: |
public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
echo "Public IP of the EC2 instance: $public_ip"
echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
- name: Determine commit hash
env:
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
run: |
if [ -z "$INPUT_COMMIT_HASH" ]; then
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
else
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
fi
- name: Start Bench with run_id
run: |
curl -k -X 'POST' \
"${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
- name: Poll Test Status
id: poll_step
run: |
status=""
while [[ "$status" != "failure" && "$status" != "success" ]]; do
response=$(curl -k -X 'GET' \
"${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H "Authorization: Bearer $API_KEY")
echo "Response: $response"
set +x
status=$(echo $response | jq -r '.status')
echo "Test status: $status"
if [[ "$status" == "failure" ]]; then
echo "Test failed"
exit 1 # Fail the job step if status is failure
elif [[ "$status" == "success" || "$status" == "null" ]]; then
break
elif [[ "$status" == "too_many_runs" ]]; then
echo "Too many runs already running"
echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
exit 1
fi
sleep 60 # Poll every 60 seconds
done
- name: Retrieve Test Logs
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
curl -k -X 'GET' \
"${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-H 'accept: application/gzip' \
-H "Authorization: Bearer $API_KEY" \
--output "test_log_${GITHUB_RUN_ID}.gz"
- name: Unzip Test Log and Print it into this job's log
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
gzip -d "test_log_${GITHUB_RUN_ID}.gz"
cat "test_log_${GITHUB_RUN_ID}"
- name: Create Allure report
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
- name: Cleanup Test Resources
if: always()
run: |
curl -k -X 'POST' \
"${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-H 'accept: application/json' \
-H "Authorization: Bearer $API_KEY" \
-d ''
- name: Stop EC2 instance and wait for the instance to be stopped
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
run: |
aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID

View File

@@ -1,115 +0,0 @@
name: Test Postgres client libraries
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
pull_request:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**'
- 'poetry.lock'
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
defaults:
run:
shell: bash -euxo pipefail {0}
env:
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
AWS_DEFAULT_REGION: eu-central-1
jobs:
check-permissions:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
needs: [ check-build-tools-image ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
test-postgres-client-libs:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init --user root
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: pg_clients
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
- name: Delete Neon Project
if: always()
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- name: Post to a Slack channel
if: github.event.schedule && failure()
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

98
.github/workflows/pg_clients.yml vendored Normal file
View File

@@ -0,0 +1,98 @@
name: Test Postgres client libraries
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
workflow_dispatch:
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
cancel-in-progress: true
jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: ubuntu-22.04
env:
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
steps:
- name: Checkout
uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Cache poetry deps
uses: actions/cache@v4
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
- name: Run pytest
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "remote_cluster" \
-rA "test_runner/pg_clients"
- name: Delete Neon Project
if: ${{ always() }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
# We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v4
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs
path: ${{ env.TEST_OUTPUT }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -6,11 +6,6 @@ on:
- ready_for_review
workflow_call:
workflow_run:
workflows: ["Build and Test"]
types:
- completed
defaults:
run:
shell: bash -euxo pipefail {0}

71
Cargo.lock generated
View File

@@ -1397,9 +1397,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.8"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
dependencies = [
"rustc_version",
]
@@ -1651,16 +1651,6 @@ dependencies = [
"rusticata-macros",
]
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
"serde",
]
[[package]]
name = "desim"
version = "0.1.0"
@@ -3018,9 +3008,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "measured"
version = "0.0.22"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
dependencies = [
"bytes",
"crossbeam-utils",
@@ -3036,9 +3026,9 @@ dependencies = [
[[package]]
name = "measured-derive"
version = "0.0.22"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
dependencies = [
"heck 0.5.0",
"proc-macro2",
@@ -3048,9 +3038,9 @@ dependencies = [
[[package]]
name = "measured-process"
version = "0.0.22"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
dependencies = [
"libc",
"measured",
@@ -3285,12 +3275,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -3683,7 +3667,6 @@ dependencies = [
"sysinfo",
"tenant_size_model",
"thiserror",
"tikv-jemallocator",
"tokio",
"tokio-epoll-uring",
"tokio-io-timeout",
@@ -4094,7 +4077,6 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-util",
"tracing",
"workspace_hack",
]
@@ -4135,12 +4117,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -5420,9 +5396,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
version = "1.0.203"
version = "1.0.183"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
dependencies = [
"serde_derive",
]
@@ -5439,9 +5415,9 @@ dependencies = [
[[package]]
name = "serde_derive"
version = "1.0.203"
version = "1.0.183"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
dependencies = [
"proc-macro2",
"quote",
@@ -6131,15 +6107,12 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.36"
version = "0.3.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
dependencies = [
"deranged",
"itoa",
"js-sys",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
@@ -6147,17 +6120,16 @@ dependencies = [
[[package]]
name = "time-core"
version = "0.1.2"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
[[package]]
name = "time-macros"
version = "0.2.18"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
dependencies = [
"num-conv",
"time-core",
]
@@ -6839,7 +6811,6 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"toml_edit 0.19.10",
"tracing",
"tracing-error",
"tracing-subscriber",
@@ -7455,12 +7426,13 @@ dependencies = [
"clap",
"clap_builder",
"crossbeam-utils",
"deranged",
"either",
"fail",
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.5",
@@ -7478,9 +7450,7 @@ dependencies = [
"num-traits",
"once_cell",
"parquet",
"proc-macro2",
"prost",
"quote",
"rand 0.8.5",
"regex",
"regex-automata 0.4.3",
@@ -7497,7 +7467,6 @@ dependencies = [
"syn 1.0.109",
"syn 2.0.52",
"sync_wrapper",
"tikv-jemalloc-sys",
"time",
"time-macros",
"tokio",

View File

@@ -111,8 +111,8 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"

View File

@@ -42,13 +42,12 @@ ARG CACHEPOT_BUCKET=neon-github-dev
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
@@ -57,7 +56,6 @@ RUN set -e \
--bin storage_controller \
--bin proxy \
--bin neon_local \
--bin storage_scrubber \
--locked --release \
&& cachepot -s
@@ -84,7 +82,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -1,13 +1,5 @@
FROM debian:bullseye-slim
# Use ARG as a build-time environment variable here to allow.
# It's not supposed to be set outside.
# Alternatively it can be obtained using the following command
# ```
# . /etc/os-release && echo "${VERSION_CODENAME}"
# ```
ARG DEBIAN_VERSION_CODENAME=bullseye
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
@@ -34,6 +26,7 @@ RUN set -e \
liblzma-dev \
libncurses5-dev \
libncursesw5-dev \
libpq-dev \
libreadline-dev \
libseccomp-dev \
libsqlite3-dev \
@@ -74,24 +67,12 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
# LLVM
ENV LLVM_VERSION=18
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
&& apt update \
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Install docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
&& apt update \
&& apt install -y docker-ce docker-ce-cli \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Configure sudo & docker
RUN usermod -aG sudo nonroot && \
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
usermod -aG docker nonroot
# AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
&& unzip -q awscliv2.zip \

View File

@@ -798,11 +798,7 @@ impl ComputeNode {
// In this case we need to connect with old `zenith_admin` name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let mut connstr = self.connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "apply_config");
let connstr = self.connstr.clone();
let mut client = match Client::connect(connstr.as_str(), NoTls) {
Err(e) => match e.code() {
Some(&SqlState::INVALID_PASSWORD)
@@ -871,19 +867,15 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut connstr = connstr.clone();
connstr
.query_pairs_mut()
.append_pair("application_name", "migrations");
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations")
});
Ok(())
}
// Wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop.
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
// have opened connection to Postgres and superuser access.
#[instrument(skip_all)]
fn pg_reload_conf(&self) -> Result<()> {
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
@@ -1395,9 +1387,7 @@ pub fn forward_termination_signal() {
let pg_pid = PG_PID.load(Ordering::SeqCst);
if pg_pid != 0 {
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
// Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
// ROs to get a list of running xacts faster instead of going through the CLOG.
// See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
kill(pg_pid, Signal::SIGINT).ok();
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
kill(pg_pid, Signal::SIGQUIT).ok();
}
}

View File

@@ -11,7 +11,6 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
mod migration;
pub mod monitor;
pub mod params;
pub mod pg_helpers;

View File

@@ -1,100 +0,0 @@
use anyhow::{Context, Result};
use postgres::Client;
use tracing::info;
pub(crate) struct MigrationRunner<'m> {
client: &'m mut Client,
migrations: &'m [&'m str],
}
impl<'m> MigrationRunner<'m> {
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
Self { client, migrations }
}
fn get_migration_id(&mut self) -> Result<i64> {
let query = "SELECT id FROM neon_migration.migration_id";
let row = self
.client
.query_one(query, &[])
.context("run_migrations get migration_id")?;
Ok(row.get::<&str, i64>("id"))
}
fn update_migration_id(&mut self) -> Result<()> {
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
self.migrations.len()
);
self.client
.simple_query(&setval)
.context("run_migrations update id")?;
Ok(())
}
fn prepare_migrations(&mut self) -> Result<()> {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
self.client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
self.client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
self.client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
self.client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
self.client.simple_query(query)?;
Ok(())
}
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_migrations()?;
let mut current_migration: usize = self.get_migration_id()? as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
self.client
.simple_query(query)
.context("run_migrations begin")?;
while current_migration < self.migrations.len() {
let migration = self.migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
);
self.client.simple_query(migration).with_context(|| {
format!("run_migration current_migration={}", current_migration)
})?;
}
current_migration += 1;
}
self.update_migration_id()?;
let query = "COMMIT";
self.client
.simple_query(query)
.context("run_migrations commit")?;
info!(
"Ran {} migrations",
(self.migrations.len() - starting_migration_id)
);
Ok(())
}
}

View File

@@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
/// - next line starts with timestamp
/// - EOF
/// - no new lines were written for the last 100 milliseconds
/// - no new lines were written for the last second
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
let mut lines = tokio::io::BufReader::new(stderr).lines();
let timeout_duration = Duration::from_millis(100);

View File

@@ -10,7 +10,6 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
use crate::config;
use crate::logger::inlinify;
use crate::migration::MigrationRunner;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
@@ -792,7 +791,69 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
];
MigrationRunner::new(client, &migrations).run_migrations()?;
let mut func = || {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
let query = "SELECT id FROM neon_migration.migration_id";
let row = client
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
client
.simple_query(query)
.context("handle_migrations begin")?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;
}
current_migration += 1;
}
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client
.simple_query(&setval)
.context("handle_migrations update id")?;
let query = "COMMIT";
client
.simple_query(query)
.context("handle_migrations commit")?;
info!(
"Ran {} migrations",
(migrations.len() - starting_migration_id)
);
Ok(())
}

View File

@@ -325,16 +325,11 @@ impl LocalEnv {
}
}
pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
}
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
self.pg_dir(pg_version, "bin")
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
}
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
self.pg_dir(pg_version, "lib")
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
}
pub fn pageserver_bin(&self) -> PathBuf {

View File

@@ -15,6 +15,7 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
};
@@ -565,39 +566,60 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let (client, conn) = self.page_server_psql_client().await?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
let client = std::pin::pin!(client);
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
let base_tarfile =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
let wal_reader =
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
// Import base
self.http_client
.import_basebackup(
tenant_id,
timeline_id,
start_lsn,
end_lsn,
pg_version,
base_tarfile,
)
.await?;
let copy_in = |reader, cmd| {
let client = &client;
async move {
let writer = client.copy_in(&cmd).await?;
let writer = std::pin::pin!(writer);
let mut writer = writer.sink_map_err(|e| {
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
});
let mut reader = std::pin::pin!(reader);
writer.send_all(&mut reader).await?;
writer.into_inner().finish().await?;
anyhow::Ok(())
}
};
// Import base
copy_in(
base_tarfile,
format!(
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
),
)
.await?;
// Import wal if necessary
if let Some(wal_reader) = wal_reader {
self.http_client
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
.await?;
copy_in(
wal_reader,
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
)
.await?;
}
Ok(())

View File

@@ -155,16 +155,16 @@ impl StorageController {
.expect("non-Unicode path")
}
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
///
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
/// to other versions if that one isn't found. Some automated tests create circumstances
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
for v in prefer_versions {
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
if tokio::fs::try_exists(&path).await? {
return Ok(path);
}
@@ -172,20 +172,11 @@ impl StorageController {
// Fall through
anyhow::bail!(
"Postgres directory '{}' not found in {}",
dir_name,
self.env.pg_distrib_dir.display(),
"Postgres binaries not found in {}",
self.env.pg_distrib_dir.display()
);
}
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
self.get_pg_dir("bin").await
}
pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
self.get_pg_dir("lib").await
}
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
@@ -238,17 +229,12 @@ impl StorageController {
.unwrap()
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_lib_dir = self.get_pg_lib_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.envs(vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
@@ -283,10 +269,7 @@ impl StorageController {
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
[],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
retry_timeout,
|| self.pg_isready(&pg_bin_dir),
@@ -341,10 +324,7 @@ impl StorageController {
&self.env.base_data_dir,
&self.env.storage_controller_bin(),
args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
[],
background_process::InitialPidFile::Create(self.pid_file()),
retry_timeout,
|| async {

View File

@@ -1,345 +0,0 @@
# Graceful Restarts of Storage Controller Managed Clusters
## Summary
This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes.
It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement
graceful cluster restarts.
## Motivation
Pageserver restarts cause read availablity downtime for tenants.
For example pageserver-3 @ us-east-1 was unavailable for a randomly
picked tenant (which requested on-demand activation) for around 30 seconds
during the restart at 2024-04-03 16:37 UTC.
Note that lots of shutdowns on loaded pageservers do not finish within the
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
This problem is not yet very acutely felt in storage controller managed pageservers since
tenant density is much lower there. However, we are planning on eventually migrating all
pageservers to storage controller management, so it makes sense to solve the issue proactively.
## Requirements
- Pageserver re-deployments cause minimal downtime for tenants
- The storage controller exposes HTTP API hooks for draining and filling tenant shards
from a given pageserver. Said hooks can be used by an orchestrator proces or a human operator.
- The storage controller exposes some HTTP API to cancel draining and filling background operations.
- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed
as usual (with downtime).
- Progress of draining/filling is visible through metrics
## Non Goals
- Integration with the control plane
- Graceful restarts for large non-HA tenants.
## Impacted Components
- storage controller
- deployment orchestrator (i.e. Ansible)
- pageserver (indirectly)
## Terminology
** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver
are distributed across the rest of the cluster.
** Filling ** is the symmetric opposite of draining. In this process tenant shards are migrated onto a given
pageserver until the cluster reaches a resonable, quiescent distribution of tenant shards across pageservers.
** Node scheduling policies ** act as constraints to the scheduler. For instance, when a
node is set in the `Paused` policy, no further shards will be scheduled on it.
** Node ** is a pageserver. Term is used interchangeably in this RFC.
** Deployment orchestrator ** is a generic term for whatever drives our deployments.
Currently, it's an Ansible playbook.
## Background
### Storage Controller Basics (skip if already familiar)
Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver nodes and tenant shards metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers.
An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assigment via `PUT location_config` requests and will notify the compute via the configured hook.
### Background Optimizations
The storage controller performs scheduling optimizations in the background. It will
migrate attachments to warm secondaries and replace secondaries in order to balance
the cluster out.
### Reconciliations Concurrency Limiting
There's a hard limit on the number of reconciles that the storage controller
can have in flight at any given time. To get an idea of scales, the limit is
128 at the time of writing.
## Implementation
Note: this section focuses on the core functionality of the graceful restart process.
It doesn't neccesarily describe the most efficient approach. Optimizations are described
separately in a later section.
### Overall Flow
This section describes how to implement graceful restarts from the perspective
of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially.
The orchestrator shall implement the following epilogue and prologue steps for each
pageserver restart:
#### Prologue
The orchestrator shall first fetch the pageserver node id from the control plane or
the pageserver it aims to restart directly. Next, it issues an HTTP request
to the storage controller in order to start the drain of said pageserver node.
All error responses are retried with a short back-off. When a 202 (Accepted)
HTTP code is returned, the drain has started. Now the orchestrator polls the
node status endpoint exposed by the storage controller in order to await the
end of the drain process. When the `policy` field of the node status response
becomes `PauseForRestart`, the drain has completed and the orchestrator can
proceed with restarting the pageserver.
The prologue is subject to an overall timeout. It will have a value in the ballpark
of minutes. As storage controller managed pageservers become more loaded this timeout
will likely have to increase.
#### Epilogue
After restarting the pageserver, the orchestrator issues an HTTP request
to the storage controller to kick off the filling process. This API call
may be retried for all error codes with a short backoff. This also serves
as a synchronization primitive as the fill will be refused if the pageserver
has not yet re-attached to the storage controller. When a 202(Accepted) HTTP
code is returned, the fill has started. Now the orchestrator polls the node
status endpoint exposed by the storage controller in order to await the end of
the filling process. When the `policy` field of the node status response becomes
`Active`, the fill has completed and the orchestrator may proceed to the next pageserver.
Again, the epilogue is subject to an overall timeout. We can start off with
using the same timeout as for the prologue, but can also consider relying on
the storage controller's background optimizations with a shorter timeout.
In the case that the deployment orchestrator times out, it attempts to cancel
the fill. This operation shall be retried with a short back-off. If it ultimately
fails it will require manual intervention to set the nodes scheduling policy to
`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic,
but it constrains the scheduler as mentioned previously.
### Node Scheduling Policy State Machine
The state machine below encodes the behaviours discussed above and
the various failover situations described in a later section.
Assuming no failures and/or timeouts the flow should be:
`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active`
```
Operator requested drain
+-----------------------------------------+
| |
+-------+-------+ +-------v-------+
| | | |
| Pause | +-----------> Draining +----------+
| | | | | |
+---------------+ | +-------+-------+ |
| | |
| | |
Drain requested| | |
| |Drain complete | Drain failed
| | | Cancelled/PS reattach/Storcon restart
| | |
+-------+-------+ | |
| | | |
+-------------+ Active <-----------+------------------+
| | | |
Fill requested | +---^---^-------+ |
| | | |
| | | |
| | | |
| Fill completed| | |
| | |PS reattach |
| | |after restart |
+-------v-------+ | | +-------v-------+
| | | | | |
| Filling +---------+ +-----------+PauseForRestart|
| | | |
+---------------+ +---------------+
```
### Draining/Filling APIs
The storage controller API to trigger the draining of a given node is:
`PUT /v1/control/node/:node_id/{drain,fill}`.
The following HTTP non-success return codes are used.
All of them are safely retriable from the perspective of the storage controller.
- 404: Requested node was not found
- 503: Requested node is known to the storage controller, but unavailable
- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining
- 409: A {drain, fill} is already in progress. Only one such background operation
is allowed per node.
When the drain is accepted and commenced a 202 HTTP code is returned.
Drains and fills shall be cancellable by the deployment orchestrator or a
human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200
response is returned when the cancelation is successful. Errors are retriable.
### Drain Process
Before accpeting a drain request the following validations is applied:
* Ensure that the node is known the storage controller
* Ensure that the schedulling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause`
* Ensure that another drain or fill is not already running on the node
* Ensure that a drain is possible (i.e. check that there is at least one
schedulable node to drain to)
After accepting the drain, the scheduling policy of the node is set to
`NodeSchedulingPolicy::Draining` and persisted in both memory and the database.
This disallows the optimizer from adding or removing shards from the node which
is desirable to avoid them racing.
Next, a separate Tokio task is spawned to manage the draining. For each tenant
shard attached to the node being drained, demote the node to a secondary and
attempt to schedule the node away. Scheduling might fail due to unsatisfiable
constraints, but that is fine. Draining is a best effort process since it might
not always be possible to cut over all shards.
Importantly, this task manages the concurrency of issued reconciles in order to
avoid drowning out the target pageservers and to allow other important reconciles
to proceed.
Once the triggered reconciles have finished or timed out, set the node's scheduling
policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain.
A note on non HA tenants: These tenants do not have secondaries, so by the description
above, they would not be migrated. It makes sense to skip them (especially the large ones)
since, depending on tenant size, this might be more disruptive than the restart since the
pageserver we've moved to do will need to on-demand download the entire working set for the tenant.
We can consider expanding to small non-HA tenants in the future.
### Fill Process
Before accpeting a fill request the following validations is applied:
* Ensure that the node is known the storage controller
* Ensure that the schedulling policy is `NodeSchedulingPolicy::Active`.
This is the only acceptable policy for the fill starting state. When a node re-attaches,
it set the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to
`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain).
* Ensure that another drain or fill is not already running on the node
After accepting the drain, the scheduling policy of the node is set to
`NodeSchedulingPolicy::Filling` and persisted in both memory and the database.
This disallows the optimizer from adding or removing shards from the node which
is desirable to avoid them racing.
Next, a separate Tokio task is spawned to manage the draining. For each tenant
shard where the filled node is a secondary, promote the secondary. This is done
until we run out of shards or the counts of attached shards become balanced across
the cluster.
Like for draining, the concurrency of spawned reconciles is limited.
### Failure Modes & Handling
Failures are generally handled by transition back into the `Active`
(neutral) state. This simplifies the implementation greatly at the
cost of adding transitions to the state machine. For example, we
could detect the `Draining` state upon restart and proceed with a drain,
but how should the storage controller know that's what the orchestrator
needs still?
#### Storage Controller Crash
When the storage controller starts up reset the node scheduling policy
of all nodes in states `Draining`, `Filling` or `PauseForRestart` to
`Active`. The rationale is that when the storage controller restarts,
we have lost context of what the deployment orchestrator wants. It also
has the benefit of making things easier to reason about.
#### Pageserver Crash During Drain
The pageserver will attempt to re-attach during restart at which
point the node scheduling policy will be set back to `Active`, thus
reenabling the scheduler to use the node.
#### Non-drained Pageserver Crash During Drain
What should happen when a pageserver we are draining to crashes during the
process. Two reasonable options are: cancel the drain and focus on the failover
*or* do both, but prioritise failover. Since the number of concurrent reconciles
produced by drains/fills are limited, we get the later behaviour for free.
My suggestion is we take this approach, but the cancellation option is trivial
to implement as well.
#### Pageserver Crash During Fill
The pageserver will attempt to re-attach during restart at which
point the node scheduling policy will be set back to `Active`, thus
reenabling the scheduler to use the node.
#### Pageserver Goes unavailable During Drain/Fill
The drain and fill jobs handle this by stopping early. When the pageserver
is detected as online by storage controller heartbeats, reset its scheduling
policy to `Active`. If a restart happens instead, see the pageserver crash
failure mode.
#### Orchestrator Drain Times Out
Orchestrator will still proceed with the restart.
When the pageserver re-attaches, the scheduling policy is set back to
`Active`.
#### Orchestrator Fill Times Out
Orchestrator will attempt to cancel the fill operation. If that fails,
the fill will continue until it quiesces and the node will be left
in the `Filling` scheduling policy. This hinders the scheduler, but is
otherwise harmless. A human operator can handle this by setting the scheduling
policy to `Active`, or we can bake in a fill timeout into the storage controller.
## Optimizations
### Location Warmth
When cutting over to a secondary, the storage controller will wait for it to
become "warm" (i.e. download enough of the tenants data). This means that some
reconciliations can take significantly longer than others and hold up precious
reconciliations units. As an optimization, the drain stage can only cut over
tenants that are already "warm". Similarly, the fill stage can prioritise the
"warmest" tenants in the fill.
Given that the number of tenants by the storage controller will be fairly low
for the foreseable future, the first implementation could simply query the tenants
for secondary status. This doesn't scale well with increasing tenant counts, so
eventually we will need new pageserver API endpoints to report the sets of
"warm" and "cold" nodes.
## Alternatives Considered
### Draining and Filling Purely as Scheduling Constraints
At its core, the storage controller is a big background loop that detects changes
in the environment and reacts on them. One could express draining and filling
of nodes purely in terms of constraining the scheduler (as opposed to having
such background tasks).
While theoretically nice, I think that's harder to implement and more importantly operate and reason about.
Consider cancellation of a drain/fill operation. We would have to update the scheduler state, create
an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish
to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong
to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion.
It would also mean that reconciliations themselves have side effects that persist in the database
(persist something to the databse when the drain is done), which I'm not conceptually fond of.
## Proof of Concept
This RFC is accompanied by a POC which implements nearly everything mentioned here
apart from the optimizations and some of the failure handling:
https://github.com/neondatabase/neon/pull/7682

View File

@@ -13,7 +13,11 @@ use std::{
use measured::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
metric::{
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
};
@@ -140,7 +144,6 @@ impl<const N: usize> HyperLogLogState<N> {
})
}
}
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N>
{
@@ -179,13 +182,12 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
CounterState::new(val as u64).collect_into(
&(),
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
name.by_ref(),
enc,
MetricValue::Int(val as i64),
)
})
}

View File

@@ -9,7 +9,7 @@ use measured::{
metric::{
counter::CounterState,
gauge::GaugeState,
group::Encoding,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
@@ -171,11 +171,8 @@ fn write_gauge<Enc: Encoding>(
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err>
where
GaugeState: MetricEncoding<Enc>,
{
GaugeState::new(x).collect_into(&(), labels, name, enc)
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
@@ -547,6 +544,15 @@ impl<T: Encoding> Encoding for Inc<T> {
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
@@ -573,6 +579,15 @@ impl<T: Encoding> Encoding for Dec<T> {
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
/// Write the dec counter to the encoder

View File

@@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18;
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key.
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;

View File

@@ -17,16 +17,6 @@ pub struct KeySpace {
pub ranges: Vec<Range<Key>>,
}
impl std::fmt::Display for KeySpace {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
for range in &self.ranges {
write!(f, "{}..{},", range.start, range.end)?;
}
write!(f, "]")
}
}
/// A wrapper type for sparse keyspaces.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct SparseKeySpace(pub KeySpace);

View File

@@ -9,7 +9,6 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -229,11 +228,6 @@ pub struct TimelineCreateRequest {
pub pg_version: Option<u32>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct LsnLeaseRequest {
pub lsn: Lsn,
}
#[derive(Serialize, Deserialize)]
pub struct TenantShardSplitRequest {
pub new_shard_count: u8,
@@ -438,51 +432,6 @@ pub enum CompactionAlgorithm {
Tiered,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ImageCompressionAlgorithm {
/// Disabled for writes, and never decompress during reading.
/// Never set this after you've enabled compression once!
DisabledNoDecompress,
// Disabled for writes, support decompressing during read path
Disabled,
/// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well.
/// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html).
Zstd {
level: Option<i8>,
},
}
impl ImageCompressionAlgorithm {
pub fn allow_decompression(&self) -> bool {
!matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress)
}
}
impl FromStr for ImageCompressionAlgorithm {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut components = s.split(['(', ')']);
let first = components
.next()
.ok_or_else(|| anyhow::anyhow!("empty string"))?;
match first {
"disabled-no-decompress" => Ok(ImageCompressionAlgorithm::DisabledNoDecompress),
"disabled" => Ok(ImageCompressionAlgorithm::Disabled),
"zstd" => {
let level = if let Some(v) = components.next() {
let v: i8 = v.parse()?;
Some(v)
} else {
None
};
Ok(ImageCompressionAlgorithm::Zstd { level })
}
_ => anyhow::bail!("invalid specifier '{first}'"),
}
}
}
#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
pub struct CompactionAlgorithmSettings {
pub kind: CompactionAlgorithm,
@@ -694,16 +643,6 @@ pub struct TimelineInfo {
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<u64>,
/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
/// beyond the branch's branch point, we only count up to the branch point.
pub pitr_history_size: u64,
/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
/// ancestor data used by this branch would have been retained anyway). If this is false, then
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
/// otherwise be able to GC.
pub within_ancestor_pitr: bool,
pub timeline_dir_layer_file_size_sum: Option<u64>,
pub wal_source_connstr: Option<String>,
@@ -1675,29 +1614,4 @@ mod tests {
AuxFilePolicy::CrossValidation
);
}
#[test]
fn test_image_compression_algorithm_parsing() {
use ImageCompressionAlgorithm::*;
assert_eq!(
ImageCompressionAlgorithm::from_str("disabled").unwrap(),
Disabled
);
assert_eq!(
ImageCompressionAlgorithm::from_str("disabled-no-decompress").unwrap(),
DisabledNoDecompress
);
assert_eq!(
ImageCompressionAlgorithm::from_str("zstd").unwrap(),
Zstd { level: None }
);
assert_eq!(
ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
Zstd { level: Some(18) }
);
assert_eq!(
ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
Zstd { level: Some(-3) }
);
}
}

View File

@@ -1,42 +1,59 @@
//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
//!
//! This module contains a variety of types used to represent the concept of sharding
//! a Neon tenant across multiple physical shards. Since there are quite a few of these,
//! we provide an summary here.
//!
//! Types used to describe shards:
//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
//! which identifies a tenant which is not shard-aware. This means its storage paths do not include
//! a shard suffix.
//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
//! without the tenant ID. This is useful for things that are implicitly scoped to a particular
//! tenant, such as layer files.
//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
//! four hex digits. An unsharded tenant is `0000`.
//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
//!
//! Types used to describe the parameters for data distribution in a sharded tenant:
//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
//! multiple shards. Its value is given in 8kiB pages.
//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
//! always zero: this is provided for future upgrades that might introduce different
//! data distribution schemes.
//!
//! Examples:
//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
//! and their slugs are 0004, 0104, 0204, and 0304.
use std::{ops::RangeInclusive, str::FromStr};
use crate::{key::Key, models::ShardParameters};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::id::TenantId;
#[doc(inline)]
pub use ::utils::shard::*;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide an summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
@@ -48,6 +65,362 @@ pub struct ShardIdentity {
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`].
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
/// uses the legacy format for `TenantShardId`. See also the documentation for
/// [`Self::count`].
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self {
Self(val)
}
}
impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
RangeInclusive::new(
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
},
Self {
tenant_id,
shard_number: ShardNumber::MAX,
shard_count: ShardCount::MAX,
},
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
/// is useful when logging from code that is already in a span that includes tenant ID, to
/// keep messages reasonably terse.
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
/// Calculate the children of this TenantShardId when splitting the overall tenant into
/// the given number of shards.
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
let mut child_shards = Vec::new();
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
// Key mapping is based on a round robin mapping of key hash modulo shard count,
// so our child shards are the ones which the same keys would map to.
if shard_number % effective_old_shard_count == self.shard_number.0 {
child_shards.push(TenantShardId {
tenant_id: self.tenant_id,
shard_number: ShardNumber(shard_number),
shard_count: new_shard_count,
})
}
}
child_shards
}
}
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
self.tenant_id.fmt(f)
}
}
}
impl std::fmt::Debug for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for TenantShardId {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
if s.len() == 32 {
// Legacy case: no shard specified
Ok(Self {
tenant_id: TenantId::from_str(s)?,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
})
} else if s.len() == 37 {
let bytes = s.as_bytes();
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
Ok(Self {
tenant_id,
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 18]> for TenantShardId {
fn from(b: [u8; 18]) -> Self {
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
Self {
tenant_id: TenantId::from(tenant_id_bytes),
shard_number: ShardNumber(b[16]),
shard_count: ShardCount(b[17]),
}
}
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,
shard_count: count,
}
}
pub fn unsharded() -> Self {
Self {
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// For use in constructing remote storage paths: concatenate this with a TenantId
/// to get a fully qualified TenantShardId.
///
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
/// that the legacy pre-sharding remote key format is preserved.
pub fn get_suffix(&self) -> String {
if self.is_unsharded() {
"".to_string()
} else {
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
}
impl std::fmt::Display for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Debug for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for ShardIndex {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 1 byte shard number, 1 byte shard count
if s.len() == 4 {
let bytes = s.as_bytes();
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(bytes, &mut shard_parts)?;
Ok(Self {
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 2]> for ShardIndex {
fn from(b: [u8; 2]) -> Self {
Self {
shard_number: ShardNumber(b[0]),
shard_count: ShardCount(b[1]),
}
}
}
impl Serialize for TenantShardId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
packed[17] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for TenantShardId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = TenantShardId;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 18])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 18] = Deserialize::deserialize(s)?;
Ok(TenantShardId::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
TenantShardId::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
18,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
@@ -212,6 +585,77 @@ impl ShardIdentity {
}
}
impl Serialize for ShardIndex {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Binary encoding is not used in index_part.json, but is included in anticipation of
// switching various structures (e.g. inter-process communication, remote metadata) to more
// compact binary encodings in future.
let mut packed: [u8; 2] = [0; 2];
packed[0] = self.shard_number.0;
packed[1] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for ShardIndex {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = ShardIndex;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 2])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 2] = Deserialize::deserialize(s)?;
Ok(ShardIndex::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
ShardIndex::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
2,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {
@@ -293,9 +737,7 @@ pub fn describe(
#[cfg(test)]
mod tests {
use std::str::FromStr;
use utils::{id::TenantId, Hex};
use utils::Hex;
use super::*;

View File

@@ -13,7 +13,6 @@ rustls.workspace = true
serde.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tokio-rustls.workspace = true
tracing.workspace = true
@@ -24,4 +23,4 @@ workspace_hack.workspace = true
once_cell.workspace = true
rustls-pemfile.workspace = true
tokio-postgres.workspace = true
tokio-postgres-rustls.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -16,7 +16,6 @@ use std::{fmt, io};
use std::{future::Future, str::FromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -401,15 +400,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
/// Wrapper for run_message_loop() that shuts down socket when we are done
pub async fn run(
pub async fn run<F, S>(
mut self,
handler: &mut impl Handler<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError> {
let ret = self.run_message_loop(handler, cancel).await;
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S + Clone,
S: Future,
{
let ret = self
.run_message_loop(handler, shutdown_watcher.clone())
.await;
tokio::select! {
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// do nothing; we most likely got already stopped by shutdown and will log it next.
}
_ = self.framed.shutdown() => {
@@ -439,17 +444,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
}
}
async fn run_message_loop(
async fn run_message_loop<F, S>(
&mut self,
handler: &mut impl Handler<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError> {
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S,
S: Future,
{
trace!("postgres backend to {:?} started", self.peer_addr);
tokio::select!(
biased;
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during handshake");
return Err(QueryError::Shutdown)
@@ -464,7 +473,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let mut query_string = Bytes::new();
while let Some(msg) = tokio::select!(
biased;
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received in run_message_loop");
return Err(QueryError::Shutdown)
@@ -476,7 +485,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
let result = self.process_message(handler, msg, &mut query_string).await;
tokio::select!(
biased;
_ = cancel.cancelled() => {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during response flush");
@@ -663,17 +672,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
assert!(self.state < ProtoState::Authentication);
let have_tls = self.tls_config.is_some();
match msg {
FeStartupPacket::SslRequest { direct } => {
FeStartupPacket::SslRequest => {
debug!("SSL requested");
if !direct {
self.write_message(&BeMessage::EncryptionResponse(have_tls))
.await?;
} else if !have_tls {
return Err(QueryError::Other(anyhow::anyhow!(
"direct SSL negotiation but no TLS support"
)));
}
self.write_message(&BeMessage::EncryptionResponse(have_tls))
.await?;
if have_tls {
self.start_tls().await?;

View File

@@ -3,14 +3,13 @@ use once_cell::sync::Lazy;
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
use pq_proto::{BeMessage, RowDescriptor};
use std::io::Cursor;
use std::sync::Arc;
use std::{future, sync::Arc};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::net::{TcpListener, TcpStream};
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::MakeTlsConnect;
use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
use tokio_postgres_rustls::MakeRustlsConnect;
use tokio_util::sync::CancellationToken;
// generate client, server test streams
async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -51,7 +50,7 @@ async fn simple_select() {
tokio::spawn(async move {
let mut handler = TestHandler {};
pgbackend.run(&mut handler, &CancellationToken::new()).await
pgbackend.run(&mut handler, future::pending::<()>).await
});
let conf = Config::new();
@@ -103,7 +102,7 @@ async fn simple_select_ssl() {
tokio::spawn(async move {
let mut handler = TestHandler {};
pgbackend.run(&mut handler, &CancellationToken::new()).await
pgbackend.run(&mut handler, future::pending::<()>).await
});
let client_cfg = rustls::ClientConfig::builder()

View File

@@ -44,9 +44,9 @@ impl ConnectionError {
/// Wraps async io `stream`, providing messages to write/flush + read Postgres
/// messages.
pub struct Framed<S> {
pub stream: S,
pub read_buf: BytesMut,
pub write_buf: BytesMut,
stream: S,
read_buf: BytesMut,
write_buf: BytesMut,
}
impl<S> Framed<S> {

View File

@@ -39,39 +39,14 @@ pub enum FeMessage {
PasswordMessage(Bytes),
}
#[derive(Clone, Copy, PartialEq, PartialOrd)]
pub struct ProtocolVersion(u32);
impl ProtocolVersion {
pub const fn new(major: u16, minor: u16) -> Self {
Self((major as u32) << 16 | minor as u32)
}
pub const fn minor(self) -> u16 {
self.0 as u16
}
pub const fn major(self) -> u16 {
(self.0 >> 16) as u16
}
}
impl fmt::Debug for ProtocolVersion {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_list()
.entry(&self.major())
.entry(&self.minor())
.finish()
}
}
#[derive(Debug)]
pub enum FeStartupPacket {
CancelRequest(CancelKeyData),
SslRequest {
direct: bool,
},
SslRequest,
GssEncRequest,
StartupMessage {
version: ProtocolVersion,
major_version: u32,
minor_version: u32,
params: StartupMessageParams,
},
}
@@ -326,23 +301,11 @@ impl FeStartupPacket {
/// different from [`FeMessage::parse`] because startup messages don't have
/// message type byte; otherwise, its comments apply.
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L118>
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234;
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L132>
const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678);
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L166>
const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679);
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L167>
const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680);
// <https://github.com/postgres/postgres/blob/04bcf9e19a4261fe9c7df37c777592c2e10c32a7/src/backend/tcop/backend_startup.c#L378-L382>
// First byte indicates standard SSL handshake message
// (It can't be a Postgres startup length because in network byte order
// that would be a startup packet hundreds of megabytes long)
if buf.first() == Some(&0x16) {
return Ok(Some(FeStartupPacket::SslRequest { direct: true }));
}
const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
const CANCEL_REQUEST_CODE: u32 = 5678;
const NEGOTIATE_SSL_CODE: u32 = 5679;
const NEGOTIATE_GSS_CODE: u32 = 5680;
// need at least 4 bytes with packet len
if buf.len() < 4 {
@@ -375,10 +338,12 @@ impl FeStartupPacket {
let mut msg = buf.split_to(len).freeze();
msg.advance(4); // consume len
let request_code = ProtocolVersion(msg.get_u32());
let request_code = msg.get_u32();
let req_hi = request_code >> 16;
let req_lo = request_code & ((1 << 16) - 1);
// StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
let message = match request_code {
CANCEL_REQUEST_CODE => {
let message = match (req_hi, req_lo) {
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
if msg.remaining() != 8 {
return Err(ProtocolError::BadMessage(
"CancelRequest message is malformed, backend PID / secret key missing"
@@ -390,22 +355,21 @@ impl FeStartupPacket {
cancel_key: msg.get_i32(),
})
}
NEGOTIATE_SSL_CODE => {
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
// Requested upgrade to SSL (aka TLS)
FeStartupPacket::SslRequest { direct: false }
FeStartupPacket::SslRequest
}
NEGOTIATE_GSS_CODE => {
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
// Requested upgrade to GSSAPI
FeStartupPacket::GssEncRequest
}
version if version.major() == RESERVED_INVALID_MAJOR_VERSION => {
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
return Err(ProtocolError::Protocol(format!(
"Unrecognized request code {}",
version.minor()
"Unrecognized request code {unrecognized_code}"
)));
}
// TODO bail if protocol major_version is not 3?
version => {
(major_version, minor_version) => {
// StartupMessage
let s = str::from_utf8(&msg).map_err(|_e| {
@@ -418,7 +382,8 @@ impl FeStartupPacket {
})?;
FeStartupPacket::StartupMessage {
version,
major_version,
minor_version,
params: StartupMessageParams {
params: msg.slice_ref(s.as_bytes()),
},
@@ -557,10 +522,6 @@ pub enum BeMessage<'a> {
RowDescription(&'a [RowDescriptor<'a>]),
XLogData(XLogDataBody<'a>),
NoticeResponse(&'a str),
NegotiateProtocolVersion {
version: ProtocolVersion,
options: &'a [&'a str],
},
KeepAlive(WalSndKeepAlive),
}
@@ -984,18 +945,6 @@ impl<'a> BeMessage<'a> {
buf.put_u8(u8::from(req.request_reply));
});
}
BeMessage::NegotiateProtocolVersion { version, options } => {
buf.put_u8(b'v');
write_body(buf, |buf| {
buf.put_u32(version.0);
buf.put_u32(options.len() as u32);
for option in options.iter() {
write_cstr(option, buf)?;
}
Ok(())
})?
}
}
Ok(())
}

View File

@@ -1,5 +1,6 @@
use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};
use anyhow::bail;
use aws_sdk_s3::types::StorageClass;
use camino::Utf8PathBuf;
@@ -175,8 +176,20 @@ fn serialize_storage_class<S: serde::Serializer>(
impl RemoteStorageConfig {
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
Ok(utils::toml_edit_ext::deserialize_item(toml)?)
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
let document: toml_edit::Document = match toml {
toml_edit::Item::Table(toml) => toml.clone().into(),
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
toml.clone().into_table().into()
}
_ => bail!("toml not a table or inline table"),
};
if document.is_empty() {
return Ok(None);
}
Ok(Some(toml_edit::de::from_document(document)?))
}
}
@@ -184,7 +197,7 @@ impl RemoteStorageConfig {
mod tests {
use super::*;
fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
let toml = input.parse::<toml_edit::Document>().unwrap();
RemoteStorageConfig::from_toml(toml.as_item())
}
@@ -194,7 +207,7 @@ mod tests {
let input = "local_path = '.'
timeout = '5s'";
let config = parse(input).unwrap();
let config = parse(input).unwrap().expect("it exists");
assert_eq!(
config,
@@ -216,7 +229,7 @@ timeout = '5s'";
timeout = '7s'
";
let config = parse(toml).unwrap();
let config = parse(toml).unwrap().expect("it exists");
assert_eq!(
config,
@@ -244,7 +257,7 @@ timeout = '5s'";
timeout = '7s'
";
let config = parse(toml).unwrap();
let config = parse(toml).unwrap().expect("it exists");
assert_eq!(
config,

View File

@@ -34,10 +34,10 @@ struct SegmentSize {
}
struct SizeAlternatives {
/// cheapest alternative if parent is available.
// cheapest alternative if parent is available.
incremental: SegmentSize,
/// cheapest alternative if parent node is not available
// cheapest alternative if parent node is not available
non_incremental: Option<SegmentSize>,
}

View File

@@ -3,17 +3,10 @@ use std::fmt::Write;
const SVG_WIDTH: f32 = 500.0;
/// Different branch kind for SVG drawing.
#[derive(PartialEq)]
pub enum SvgBranchKind {
Timeline,
Lease,
}
struct SvgDraw<'a> {
storage: &'a StorageModel,
branches: &'a [String],
seg_to_branch: &'a [(usize, SvgBranchKind)],
seg_to_branch: &'a [usize],
sizes: &'a [SegmentSizeResult],
// layout
@@ -49,18 +42,13 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> {
"<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
)?;
writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
writeln!(
result,
"<line x1=\"10\" y1=\"85\" x2=\"10\" y2=\"95\" stroke-width=\"3\" stroke=\"blue\" />"
)?;
writeln!(result, "<text x=\"20\" y=\"95\">LSN lease</text>")?;
Ok(())
}
pub fn draw_svg(
storage: &StorageModel,
branches: &[String],
seg_to_branch: &[(usize, SvgBranchKind)],
seg_to_branch: &[usize],
sizes: &SizeResult,
) -> anyhow::Result<String> {
let mut draw = SvgDraw {
@@ -112,7 +100,7 @@ impl<'a> SvgDraw<'a> {
// Layout the timelines on Y dimension.
// TODO
let mut y = 120.0;
let mut y = 100.0;
let mut branch_y_coordinates = Vec::new();
for _branch in self.branches {
branch_y_coordinates.push(y);
@@ -121,7 +109,7 @@ impl<'a> SvgDraw<'a> {
// Calculate coordinates for each point
let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
.map(|(seg, (branch_id, _))| {
.map(|(seg, branch_id)| {
let x = (seg.lsn - min_lsn) as f32 / xscale;
let y = branch_y_coordinates[*branch_id];
(x, y)
@@ -187,22 +175,6 @@ impl<'a> SvgDraw<'a> {
// draw a snapshot point if it's needed
let (coord_x, coord_y) = self.seg_coordinates[seg_id];
let (_, kind) = &self.seg_to_branch[seg_id];
if kind == &SvgBranchKind::Lease {
let (x1, y1) = (coord_x, coord_y - 10.0);
let (x2, y2) = (coord_x, coord_y + 10.0);
let style = "stroke-width=\"3\" stroke=\"blue\"";
writeln!(
result,
"<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
)?;
writeln!(result, " <title>leased lsn at {}</title>", seg.lsn)?;
writeln!(result, "</line>")?;
}
if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
writeln!(
result,

View File

@@ -40,7 +40,6 @@ thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
toml_edit.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }

View File

@@ -74,15 +74,6 @@ pub fn parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
.transpose()
}
pub fn must_parse_query_param<E: fmt::Display, T: FromStr<Err = E>>(
request: &Request<Body>,
param_name: &str,
) -> Result<T, ApiError> {
parse_query_param(request, param_name)?.ok_or_else(|| {
ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters"))
})
}
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
match request.body_mut().data().await {
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),

View File

@@ -26,8 +26,6 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id;
pub mod shard;
mod hex;
pub use hex::Hex;
@@ -96,8 +94,6 @@ pub mod env;
pub mod poison;
pub mod toml_edit_ext;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1,451 +0,0 @@
//! See `pageserver_api::shard` for description on sharding.
use std::{ops::RangeInclusive, str::FromStr};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use crate::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(pub u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// Formatting helper, for generating the `shard_id` label in traces.
pub struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`].
///
/// This method returns the actual number of shards, i.e. if our internal value is
/// zero, we return 1 (unsharded tenants have 1 shard).
pub fn count(&self) -> u8 {
if self.0 > 0 {
self.0
} else {
1
}
}
/// The literal internal value: this is **not** the number of shards in the
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
/// [`Self::count`] if you want to know the cardinality of shards.
pub fn literal(&self) -> u8 {
self.0
}
/// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
/// uses the legacy format for `TenantShardId`. See also the documentation for
/// [`Self::count`].
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self {
Self(val)
}
}
impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
RangeInclusive::new(
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
},
Self {
tenant_id,
shard_number: ShardNumber::MAX,
shard_count: ShardCount::MAX,
},
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
/// is useful when logging from code that is already in a span that includes tenant ID, to
/// keep messages reasonably terse.
pub fn to_index(&self) -> ShardIndex {
ShardIndex {
shard_number: self.shard_number,
shard_count: self.shard_count,
}
}
/// Calculate the children of this TenantShardId when splitting the overall tenant into
/// the given number of shards.
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
let mut child_shards = Vec::new();
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
// Key mapping is based on a round robin mapping of key hash modulo shard count,
// so our child shards are the ones which the same keys would map to.
if shard_number % effective_old_shard_count == self.shard_number.0 {
child_shards.push(TenantShardId {
tenant_id: self.tenant_id,
shard_number: ShardNumber(shard_number),
shard_count: new_shard_count,
})
}
}
child_shards
}
}
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
self.tenant_id.fmt(f)
}
}
}
impl std::fmt::Debug for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for TenantShardId {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
if s.len() == 32 {
// Legacy case: no shard specified
Ok(Self {
tenant_id: TenantId::from_str(s)?,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
})
} else if s.len() == 37 {
let bytes = s.as_bytes();
let tenant_id = TenantId::from_hex(&bytes[0..32])?;
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
Ok(Self {
tenant_id,
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 18]> for TenantShardId {
fn from(b: [u8; 18]) -> Self {
let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
Self {
tenant_id: TenantId::from(tenant_id_bytes),
shard_number: ShardNumber(b[16]),
shard_count: ShardCount(b[17]),
}
}
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,
shard_count: count,
}
}
pub fn unsharded() -> Self {
Self {
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
/// For use in constructing remote storage paths: concatenate this with a TenantId
/// to get a fully qualified TenantShardId.
///
/// Backward compat: this function returns an empty string if Self::is_unsharded, such
/// that the legacy pre-sharding remote key format is preserved.
pub fn get_suffix(&self) -> String {
if self.is_unsharded() {
"".to_string()
} else {
format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
}
impl std::fmt::Display for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Debug for ShardIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Debug is the same as Display: the compact hex representation
write!(f, "{}", self)
}
}
impl std::str::FromStr for ShardIndex {
type Err = hex::FromHexError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
// Expect format: 1 byte shard number, 1 byte shard count
if s.len() == 4 {
let bytes = s.as_bytes();
let mut shard_parts: [u8; 2] = [0u8; 2];
hex::decode_to_slice(bytes, &mut shard_parts)?;
Ok(Self {
shard_number: ShardNumber(shard_parts[0]),
shard_count: ShardCount(shard_parts[1]),
})
} else {
Err(hex::FromHexError::InvalidStringLength)
}
}
}
impl From<[u8; 2]> for ShardIndex {
fn from(b: [u8; 2]) -> Self {
Self {
shard_number: ShardNumber(b[0]),
shard_count: ShardCount(b[1]),
}
}
}
impl Serialize for TenantShardId {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
packed[17] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for TenantShardId {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = TenantShardId;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 18])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 18] = Deserialize::deserialize(s)?;
Ok(TenantShardId::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
TenantShardId::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
18,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
impl Serialize for ShardIndex {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Binary encoding is not used in index_part.json, but is included in anticipation of
// switching various structures (e.g. inter-process communication, remote metadata) to more
// compact binary encodings in future.
let mut packed: [u8; 2] = [0; 2];
packed[0] = self.shard_number.0;
packed[1] = self.shard_count.0;
packed.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for ShardIndex {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> serde::de::Visitor<'de> for IdVisitor {
type Value = ShardIndex;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 2])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 2] = Deserialize::deserialize(s)?;
Ok(ShardIndex::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
ShardIndex::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
2,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}

View File

@@ -1,22 +0,0 @@
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error("item is not a document")]
ItemIsNotADocument,
#[error(transparent)]
Serde(toml_edit::de::Error),
}
pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
where
T: serde::de::DeserializeOwned,
{
let document: toml_edit::Document = match item {
toml_edit::Item::Table(toml) => toml.clone().into(),
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
toml.clone().into_table().into()
}
_ => return Err(Error::ItemIsNotADocument),
};
toml_edit::de::from_document(document).map_err(Error::Serde)
}

View File

@@ -62,7 +62,6 @@ sync_wrapper.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-epoll-uring.workspace = true
tokio-io-timeout.workspace = true

View File

@@ -8,7 +8,7 @@ license.workspace = true
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
reqwest.workspace = true
utils.workspace = true
serde.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -9,8 +9,6 @@ use utils::{
lsn::Lsn,
};
pub use reqwest::Body as ReqwestBody;
pub mod util;
#[derive(Debug, Clone)]
@@ -22,9 +20,6 @@ pub struct Client {
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("send request: {0}")]
SendRequest(reqwest::Error),
#[error("receive body: {0}")]
ReceiveBody(reqwest::Error),
@@ -178,30 +173,19 @@ impl Client {
self.request(Method::GET, uri, ()).await
}
fn start_request<U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
) -> reqwest::RequestBuilder {
let req = self.client.request(method, uri);
if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
}
}
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
&self,
method: Method,
uri: U,
body: B,
) -> Result<reqwest::Response> {
self.start_request(method, uri)
.json(&body)
.send()
.await
.map_err(Error::ReceiveBody)
let req = self.client.request(method, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.json(&body).send().await.map_err(Error::ReceiveBody)
}
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
@@ -625,53 +609,4 @@ impl Client {
}),
}
}
pub async fn import_basebackup(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
base_lsn: Lsn,
end_lsn: Lsn,
pg_version: u32,
basebackup_tarball: ReqwestBody,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
self.mgmt_api_endpoint,
);
self.start_request(Method::PUT, uri)
.body(basebackup_tarball)
.send()
.await
.map_err(Error::SendRequest)?
.error_from_body()
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn import_wal(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
wal_tarball: ReqwestBody,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}",
self.mgmt_api_endpoint,
);
self.start_request(Method::PUT, uri)
.body(wal_tarball)
.send()
.await
.map_err(Error::SendRequest)?
.error_from_body()
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> {
let toml_item = toml_document
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?;
let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let cancel = CancellationToken::new();
storage

View File

@@ -348,36 +348,35 @@ where
self.add_rel(rel, rel).await?;
}
}
}
for (path, content) in self
.timeline
.list_aux_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
{
if path.starts_with("pg_replslot") {
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
let restart_lsn = Lsn(u64::from_le_bytes(
content[offs..offs + 8].try_into().unwrap(),
));
info!("Replication slot {} restart LSN={}", path, restart_lsn);
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
} else if path == "pg_logical/replorigin_checkpoint" {
// replorigin_checkoint is written only on compute shutdown, so it contains
// deteriorated values. So we generate our own version of this file for the particular LSN
// based on information about replorigins extracted from transaction commit records.
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
// but now we should handle (skip) it for backward compatibility.
continue;
}
let header = new_tar_header(&path, content.len() as u64)?;
self.ar
.append(&header, &*content)
for (path, content) in self
.timeline
.list_aux_files(self.lsn, self.ctx)
.await
.context("could not add aux file to basebackup tarball")?;
.map_err(|e| BasebackupError::Server(e.into()))?
{
if path.starts_with("pg_replslot") {
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
let restart_lsn = Lsn(u64::from_le_bytes(
content[offs..offs + 8].try_into().unwrap(),
));
info!("Replication slot {} restart LSN={}", path, restart_lsn);
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
} else if path == "pg_logical/replorigin_checkpoint" {
// replorigin_checkoint is written only on compute shutdown, so it contains
// deteriorated values. So we generate our own version of this file for the particular LSN
// based on information about replorigins extracted from transaction commit records.
// In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
// but now we should handle (skip) it for backward compatibility.
continue;
}
let header = new_tar_header(&path, content.len() as u64)?;
self.ar
.append(&header, &*content)
.await
.context("could not add aux file to basebackup tarball")?;
}
}
if min_restart_lsn != Lsn::MAX {
info!(
"Min restart LSN for logical replication is {}",

View File

@@ -47,9 +47,6 @@ use utils::{
project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
@@ -424,10 +421,6 @@ fn start_pageserver(
background_jobs_can_start: background_jobs_barrier.clone(),
};
info!(config=?conf.l0_flush, "using l0_flush config");
let l0_flush_global_state =
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -436,7 +429,6 @@ fn start_pageserver(
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
},
order,
shutdown_pageserver.clone(),
@@ -660,6 +652,7 @@ fn start_pageserver(
async move {
page_service::libpq_listener_main(
tenant_manager,
broker_client,
pg_auth,
pageserver_listener,
conf.pg_auth_type,

View File

@@ -5,7 +5,7 @@
//! See also `settings.md` for better description on every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
use pageserver_api::shard::TenantShardId;
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
@@ -30,11 +30,11 @@ use utils::{
logging::LogFormat,
};
use crate::tenant::timeline::GetVectoredImpl;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
@@ -50,7 +50,6 @@ pub mod defaults {
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT,
};
use pageserver_api::models::ImageCompressionAlgorithm;
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
@@ -91,9 +90,6 @@ pub mod defaults {
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::DisabledNoDecompress;
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
@@ -163,7 +159,7 @@ pub mod defaults {
#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB}
#[remote_storage]
[remote_storage]
"#
);
@@ -289,16 +285,12 @@ pub struct PageServerConf {
pub validate_vectored_get: bool,
pub image_compression: ImageCompressionAlgorithm,
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
/// is exceeded, we start proactively closing ephemeral layers to limit the total amount
/// of ephemeral data.
///
/// Setting this to zero disables limits on total ephemeral layer size.
pub ephemeral_bytes_per_memory_kb: usize,
pub l0_flush: L0FlushConfig,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -403,11 +395,7 @@ struct PageServerConfigBuilder {
validate_vectored_get: BuilderValue<bool>,
image_compression: BuilderValue<ImageCompressionAlgorithm>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
l0_flush: BuilderValue<L0FlushConfig>,
}
impl PageServerConfigBuilder {
@@ -494,10 +482,8 @@ impl PageServerConfigBuilder {
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
}
}
}
@@ -681,18 +667,10 @@ impl PageServerConfigBuilder {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
self.image_compression = BuilderValue::Set(value);
}
pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) {
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
}
pub fn l0_flush(&mut self, value: L0FlushConfig) {
self.l0_flush = BuilderValue::Set(value);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -749,9 +727,7 @@ impl PageServerConfigBuilder {
get_impl,
max_vectored_read_bytes,
validate_vectored_get,
image_compression,
ephemeral_bytes_per_memory_kb,
l0_flush,
}
CUSTOM LOGIC
{
@@ -942,7 +918,7 @@ impl PageServerConf {
"http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
"pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
"remote_storage" => {
builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?))
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
}
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
@@ -970,7 +946,7 @@ impl PageServerConf {
builder.metric_collection_endpoint(Some(endpoint));
},
"metric_collection_bucket" => {
builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?))
builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?)
}
"synthetic_size_calculation_interval" =>
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
@@ -1028,15 +1004,9 @@ impl PageServerConf {
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"image_compression" => {
builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
}
"ephemeral_bytes_per_memory_kb" => {
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
}
"l0_flush" => {
builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1118,10 +1088,8 @@ impl PageServerConf {
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
}
}
}
@@ -1360,9 +1328,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -1435,9 +1401,7 @@ background_task_maximum_delay = '334 s'
.expect("Invalid default constant")
),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
},
"Should be able to parse all basic config values correctly"
);
@@ -1717,19 +1681,6 @@ threshold = "20m"
}
}
#[test]
fn empty_remote_storage_is_error() {
let tempdir = tempdir().unwrap();
let (workdir, _) = prepare_fs(&tempdir).unwrap();
let input = r#"
remote_storage = {}
"#;
let doc = toml_edit::Document::from_str(input).unwrap();
let err = PageServerConf::parse_and_validate(&doc, &workdir)
.expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
assert!(format!("{err}").contains("remote_storage"), "{err}");
}
fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
let tempdir_path = tempdir.path();

View File

@@ -190,7 +190,7 @@ where
}
} else {
// If we failed validation, then do not apply any of the projected updates
info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
@@ -225,7 +225,7 @@ where
&& (tenant.generation == *validated_generation);
if !this_list_valid {
info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
} else {

View File

@@ -265,19 +265,15 @@ paths:
type: string
format: hex
post:
description: Obtains a lease for the given LSN.
requestBody:
content:
application/json:
schema:
type: object
required:
- lsn
properties:
lsn:
description: A LSN to obtain the lease for.
type: string
format: hex
description: Obtain lease for the given LSN
parameters:
- name: lsn
in: query
required: true
schema:
type: string
format: hex
description: A LSN to obtain the lease for
responses:
"200":
description: OK

View File

@@ -10,7 +10,6 @@ use std::time::Duration;
use anyhow::{anyhow, Context, Result};
use enumset::EnumSet;
use futures::StreamExt;
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
@@ -23,7 +22,6 @@ use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
@@ -44,15 +42,13 @@ use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel};
use tokio_util::io::StreamReader;
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::JwtAuth;
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::prometheus_metrics_handler;
use utils::http::endpoint::request_span;
use utils::http::request::must_parse_query_param;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use crate::context::{DownloadBehavior, RequestContext};
@@ -231,7 +227,7 @@ impl From<UpsertLocationError> for ApiError {
BadRequest(e) => ApiError::BadRequest(e),
Unavailable(_) => ApiError::ShuttingDown,
e @ InProgress => ApiError::Conflict(format!("{e}")),
Flush(e) | InternalError(e) => ApiError::InternalServerError(e),
Flush(e) | Other(e) => ApiError::InternalServerError(e),
}
}
}
@@ -410,8 +406,6 @@ async fn build_timeline_info_common(
let walreceiver_status = timeline.walreceiver_status();
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
@@ -432,8 +426,6 @@ async fn build_timeline_info_common(
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
current_physical_size,
current_logical_size_non_incremental: None,
pitr_history_size,
within_ancestor_pitr,
timeline_dir_layer_file_size_sum: None,
wal_source_connstr,
last_received_msg_lsn,
@@ -1199,15 +1191,10 @@ fn synthetic_size_html_response(
timeline_map.insert(ti.timeline_id, index);
timeline_ids.push(ti.timeline_id.to_string());
}
let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs
let seg_to_branch: Vec<usize> = inputs
.segments
.iter()
.map(|seg| {
(
*timeline_map.get(&seg.timeline_id).unwrap(),
seg.kind.into(),
)
})
.map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
.collect();
let svg =
@@ -1309,7 +1296,7 @@ async fn update_tenant_config_handler(
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
.map_err(ApiError::InternalServerError)?;
tenant.set_new_tenant_config(new_tenant_conf);
json_response(StatusCode::OK, ())
@@ -1540,13 +1527,15 @@ async fn handle_tenant_break(
// Obtains an lsn lease on the given timeline.
async fn lsn_lease_handler(
mut request: Request<Body>,
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let lsn = json_request::<LsnLeaseRequest>(&mut request).await?.lsn;
let lsn: Lsn = parse_query_param(&request, "lsn")?
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -2407,189 +2396,6 @@ async fn post_top_tenants(
)
}
async fn put_tenant_timeline_import_basebackup(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
async move {
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
let broker_client = state.broker_client.clone();
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.map_err(ApiError::InternalServerError)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
// from connecting before that and writing conflicting wal.
//
// This is not relevant for pageserver->pageserver migrations, since there's
// no wal to import. But should be fixed if we want to import from postgres.
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import basebackup provided via CopyData
info!("importing basebackup");
timeline
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
// Read the end of the tar archive.
read_tar_eof(body)
.await
.map_err(ApiError::InternalServerError)?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
// and checking that it matches in size with what was imported.
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
info!("done");
json_response(StatusCode::OK, ())
}
.instrument(span)
.await
}
async fn put_tenant_timeline_import_wal(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?;
let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
async move {
let state = get_state(&request);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
})
}));
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import wal provided via CopyData
info!("importing wal");
crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?;
info!("wal import complete");
// Read the end of the tar archive.
read_tar_eof(body).await.map_err(ApiError::InternalServerError)?;
// TODO Does it make sense to overshoot?
if timeline.get_last_record_lsn() < end_lsn {
return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")));
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})?;
info!("done");
json_response(StatusCode::OK, ())
}.instrument(span).await
}
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
use tokio::io::AsyncReadExt;
let mut buf = [0u8; 512];
// Read the all-zeros block, and verify it
let mut total_bytes = 0;
while total_bytes < 512 {
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
total_bytes += nbytes;
if nbytes == 0 {
break;
}
}
if total_bytes < 512 {
anyhow::bail!("incomplete or invalid tar EOF marker");
}
if !buf.iter().all(|&x| x == 0) {
anyhow::bail!("invalid tar EOF marker");
}
// Drain any extra zero-blocks after the EOF marker
let mut trailing_bytes = 0;
let mut seen_nonzero_bytes = false;
loop {
let nbytes = reader.read(&mut buf).await?;
trailing_bytes += nbytes;
if !buf.iter().all(|&x| x == 0) {
seen_nonzero_bytes = true;
}
if nbytes == 0 {
break;
}
}
if seen_nonzero_bytes {
anyhow::bail!("unexpected non-zero bytes after the tar archive");
}
if trailing_bytes % 512 != 0 {
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
}
Ok(())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -2884,13 +2690,5 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|r| testing_api_handler("perf_info", r, perf_info),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup",
|r| api_handler(r, put_tenant_timeline_import_basebackup),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
|r| api_handler(r, put_tenant_timeline_import_wal),
)
.any(handler_404))
}

View File

@@ -1,46 +0,0 @@
use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[default]
PageCached,
#[serde(rename_all = "snake_case")]
Direct { max_concurrency: NonZeroUsize },
}
#[derive(Clone)]
pub struct L0FlushGlobalState(Arc<Inner>);
pub(crate) enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}
impl L0FlushGlobalState {
pub fn new(config: L0FlushConfig) -> Self {
match config {
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
L0FlushConfig::Direct { max_concurrency } => {
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
Self(Arc::new(Inner::Direct { semaphore }))
}
}
}
pub(crate) fn inner(&self) -> &Arc<Inner> {
&self.0
}
}
impl L0FlushConfig {
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
use L0FlushConfig::*;
match self {
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
}
}
}

View File

@@ -11,7 +11,6 @@ pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod l0_flush;
pub use pageserver_api::keyspace;
pub mod aux_file;
pub mod metrics;

View File

@@ -8,7 +8,7 @@ use metrics::{
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, VariantNames};
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use tracing::warn;
use utils::id::TimelineId;
@@ -464,24 +464,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_pitr_history_size",
"Data written since PITR cutoff on this timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
"Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_standby_horizon",
@@ -494,7 +476,7 @@ static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem, for attached locations.",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
@@ -1094,12 +1076,21 @@ pub(crate) mod virtual_file_io_engine {
});
}
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,
per_tenant_timeline: Histogram,
}
impl GlobalAndPerTimelineHistogram {
fn observe(&self, value: f64) {
self.global.observe(value);
self.per_tenant_timeline.observe(value);
}
}
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
global_metric: &'a Histogram,
// Optional because not all op types are tracked per-timeline
timeline_metric: Option<&'a Histogram>,
h: &'a GlobalAndPerTimelineHistogram,
ctx: &'c RequestContext,
start: std::time::Instant,
op: SmgrQueryType,
@@ -1130,10 +1121,7 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
elapsed
}
};
self.global_metric.observe(ex_throttled.as_secs_f64());
if let Some(timeline_metric) = self.timeline_metric {
timeline_metric.observe(ex_throttled.as_secs_f64());
}
self.h.observe(ex_throttled.as_secs_f64());
}
}
@@ -1158,8 +1146,7 @@ pub enum SmgrQueryType {
#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
global_metrics: [Histogram; SmgrQueryType::COUNT],
per_timeline_getpage: Histogram,
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
}
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
@@ -1237,32 +1224,27 @@ impl SmgrQueryTimePerTimeline {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
let global_metrics = std::array::from_fn(|i| {
let metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
SMGR_QUERY_TIME_GLOBAL
let global = SMGR_QUERY_TIME_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap()
.unwrap();
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
.unwrap();
GlobalAndPerTimelineHistogram {
global,
per_tenant_timeline,
}
});
let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
&tenant_id,
&shard_slug,
&timeline_id,
])
.unwrap();
Self {
global_metrics,
per_timeline_getpage,
}
Self { metrics }
}
pub(crate) fn start_timer<'c: 'a, 'a>(
&'a self,
op: SmgrQueryType,
ctx: &'c RequestContext,
) -> Option<impl Drop + '_> {
let global_metric = &self.global_metrics[op as usize];
) -> impl Drop + '_ {
let metric = &self.metrics[op as usize];
let start = Instant::now();
match ctx.micros_spent_throttled.open() {
Ok(()) => (),
@@ -1281,20 +1263,12 @@ impl SmgrQueryTimePerTimeline {
});
}
}
let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) {
Some(&self.per_timeline_getpage)
} else {
None
};
Some(GlobalAndPerTimelineHistogramTimer {
global_metric,
timeline_metric,
GlobalAndPerTimelineHistogramTimer {
h: metric,
ctx,
start,
op,
})
}
}
}
@@ -1341,9 +1315,17 @@ mod smgr_query_time_tests {
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.global_metrics[*op as usize].get_sample_count())
.map(|op| metrics.metrics[*op as usize].global.get_sample_count())
.sum();
(global, metrics.per_timeline_getpage.get_sample_count())
let per_tenant_timeline: u64 = ops
.iter()
.map(|op| {
metrics.metrics[*op as usize]
.per_tenant_timeline
.get_sample_count()
})
.sum();
(global, per_tenant_timeline)
};
let (pre_global, pre_per_tenant_timeline) = get_counts();
@@ -1354,12 +1336,7 @@ mod smgr_query_time_tests {
drop(timer);
let (post_global, post_per_tenant_timeline) = get_counts();
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
// getpage ops are tracked per-timeline, others aren't
assert_eq!(post_per_tenant_timeline, 1);
} else {
assert_eq!(post_per_tenant_timeline, 0);
}
assert_eq!(post_per_tenant_timeline, 1);
assert!(post_global > pre_global);
}
}
@@ -1456,12 +1433,10 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
}
}
pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_live_connections_started",
"Number of network connections that we started handling",
"pageserver_live_connections_finished",
"Number of network connections that we finished handling",
pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
&["pageserver_connection_kind"]
)
.expect("failed to define a metric")
@@ -1472,7 +1447,10 @@ pub(crate) enum ComputeCommandKind {
PageStreamV2,
PageStream,
Basebackup,
GetLastRecordRlsn,
Fullbackup,
ImportBasebackup,
ImportWal,
LeaseLsn,
Show,
}
@@ -1713,15 +1691,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
}
});
pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_secondary_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem, for secondary locations.",
&["tenant_id", "shard_id"]
)
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -2124,8 +2093,6 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2199,15 +2166,6 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let pitr_history_size = PITR_HISTORY_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let archival_size = TIMELINE_ARCHIVE_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2260,8 +2218,6 @@ impl TimelineMetrics {
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
pitr_history_size,
archival_size,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
@@ -2319,10 +2275,6 @@ impl TimelineMetrics {
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2356,12 +2308,14 @@ impl TimelineMetrics {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
SmgrQueryType::GetPageAtLsn.into(),
tenant_id,
shard_id,
timeline_id,
]);
for op in SmgrQueryType::iter() {
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
op.into(),
tenant_id,
shard_id,
timeline_id,
]);
}
}
}

View File

@@ -4,7 +4,9 @@
use anyhow::Context;
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::TenantState;
@@ -26,6 +28,7 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
@@ -34,6 +37,7 @@ use std::time::Instant;
use std::time::SystemTime;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::ConnectionId;
@@ -49,8 +53,9 @@ use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
@@ -61,6 +66,7 @@ use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::FlushLayerError;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
@@ -76,6 +82,56 @@ use postgres_ffi::BLCKSZ;
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
/// and check that there is no more data after the EOF marker.
///
/// 'tar' command can also write extra blocks of zeros, up to a record
/// size, controlled by the --record-size argument. Ignore them too.
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
use tokio::io::AsyncReadExt;
let mut buf = [0u8; 512];
// Read the all-zeros block, and verify it
let mut total_bytes = 0;
while total_bytes < 512 {
let nbytes = reader.read(&mut buf[total_bytes..]).await?;
total_bytes += nbytes;
if nbytes == 0 {
break;
}
}
if total_bytes < 512 {
anyhow::bail!("incomplete or invalid tar EOF marker");
}
if !buf.iter().all(|&x| x == 0) {
anyhow::bail!("invalid tar EOF marker");
}
// Drain any extra zero-blocks after the EOF marker
let mut trailing_bytes = 0;
let mut seen_nonzero_bytes = false;
loop {
let nbytes = reader.read(&mut buf).await?;
trailing_bytes += nbytes;
if !buf.iter().all(|&x| x == 0) {
seen_nonzero_bytes = true;
}
if nbytes == 0 {
break;
}
}
if seen_nonzero_bytes {
anyhow::bail!("unexpected non-zero bytes after the tar archive");
}
if trailing_bytes % 512 != 0 {
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
}
Ok(())
}
///////////////////////////////////////////////////////////////////////////////
///
@@ -85,6 +141,7 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
///
pub async fn libpq_listener_main(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
listener: TcpListener,
auth_type: AuthType,
@@ -129,6 +186,7 @@ pub async fn libpq_listener_main(
false,
page_service_conn_main(
tenant_manager.clone(),
broker_client.clone(),
local_auth,
socket,
auth_type,
@@ -151,14 +209,20 @@ pub async fn libpq_listener_main(
#[instrument(skip_all, fields(peer_addr))]
async fn page_service_conn_main(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
connection_ctx: RequestContext,
) -> anyhow::Result<()> {
let _guard = LIVE_CONNECTIONS
.with_label_values(&["page_service"])
.guard();
// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
// get called, even in presence of panics.
let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
socket
.set_nodelay(true)
@@ -203,11 +267,12 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx);
let mut conn_handler =
PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
.run(&mut conn_handler, &task_mgr::shutdown_token())
.run(&mut conn_handler, task_mgr::shutdown_watcher)
.await
{
Ok(()) => {
@@ -234,6 +299,7 @@ struct HandlerTimeline {
}
struct PageServerHandler {
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -325,11 +391,13 @@ impl From<WaitLsnError> for QueryError {
impl PageServerHandler {
pub fn new(
tenant_manager: Arc<TenantManager>,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
connection_ctx: RequestContext,
) -> Self {
PageServerHandler {
tenant_manager,
broker_client,
auth,
claims: None,
connection_ctx,
@@ -412,6 +480,73 @@ impl PageServerHandler {
)
}
fn copyin_stream<'a, IO>(
&'a self,
pgb: &'a mut PostgresBackend<IO>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> + 'a
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
async_stream::try_stream! {
loop {
let msg = tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
Err(QueryError::Shutdown)
}
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
};
match msg {
Ok(Some(message)) => {
let copy_data_bytes = match message {
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = "client terminated connection with Terminate message during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {m:?}");
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
break;
}
};
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
Err(io_error)?;
}
Err(other) => {
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
}
};
}
}
}
#[instrument(skip_all)]
async fn handle_pagerequests<IO>(
&mut self,
@@ -583,6 +718,128 @@ impl PageServerHandler {
Ok(())
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))]
async fn handle_import_basebackup<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
base_lsn: Lsn,
_end_lsn: Lsn,
pg_version: u32,
ctx: RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
// Create empty timeline
info!("creating new timeline");
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
.await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
// from connecting before that and writing conflicting wal.
//
// This is not relevant for pageserver->pageserver migrations, since there's
// no wal to import. But should be fixed if we want to import from postgres.
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import basebackup provided via CopyData
info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
timeline
.import_basebackup_from_tar(
tenant.clone(),
&mut copyin_reader,
base_lsn,
self.broker_client.clone(),
&ctx,
)
.await?;
// Read the end of the tar archive.
read_tar_eof(copyin_reader).await?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
// and checking that it matches in size with what was imported.
// It wouldn't work if base came from vanilla postgres though,
// since we discard some log files.
info!("done");
Ok(())
}
#[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
async fn handle_import_wal<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
ctx: RequestContext,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
// Import wal provided via CopyData
info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");
// Read the end of the tar archive.
read_tar_eof(copyin_reader).await?;
// TODO Does it make sense to overshoot?
if timeline.get_last_record_lsn() < end_lsn {
return Err(QueryError::Other(
anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))
);
}
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.freeze_and_flush().await.map_err(|e| match e {
FlushLayerError::Cancelled => QueryError::Shutdown,
other => QueryError::Other(other.into()),
})?;
info!("done");
Ok(())
}
/// Helper function to handle the LSN from client request.
///
/// Each GetPage (and Exists and Nblocks) request includes information about
@@ -1399,6 +1656,53 @@ where
metric_recording.observe(&res);
res?;
}
// return pair of prev_lsn and last_lsn
else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for get_last_record_rlsn command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::GetLastRecordRlsn)
.inc();
async {
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let end_of_timeline = timeline.get_last_record_rlsn();
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"prev_lsn"),
RowDescriptor::text_col(b"last_lsn"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(end_of_timeline.prev.to_string().as_bytes()),
Some(end_of_timeline.last.to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
anyhow::Ok(())
}
.instrument(info_span!(
"handle_get_last_record_lsn",
shard_id = tracing::field::Empty
))
.await?;
}
// same as basebackup, but result includes relational data as well
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
if params.len() < 2 {
@@ -1453,6 +1757,109 @@ where
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("import basebackup ") {
// Import the `base` section (everything but the wal) of a basebackup.
// Assumes the tenant already exists on this pageserver.
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
//
// Example import command:
// 1. Get start/end LSN from backup_manifest file
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let params = &parts[2..];
if params.len() != 5 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import basebackup command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let base_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
let pg_version = u32::from_str(params[4])
.with_context(|| format!("Failed to parse pg_version from {}", params[4]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportBasebackup)
.inc();
match self
.handle_import_basebackup(
pgb,
tenant_id,
timeline_id,
base_lsn,
end_lsn,
pg_version,
ctx,
)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.starts_with("import wal ") {
// Import the `pg_wal` section of a basebackup.
//
// Files are scheduled to be persisted to remote storage, and the
// caller should poll the http api to check when that is done.
let params = &parts[2..];
if params.len() != 4 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for import wal command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
let start_lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
let end_lsn = Lsn::from_str(params[3])
.with_context(|| format!("Failed to parse Lsn from {}", params[3]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportWal)
.inc();
match self
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
.await
{
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(
&e.to_string(),
Some(e.pg_error_code()),
))?
}
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
// on connect

View File

@@ -854,14 +854,13 @@ impl Timeline {
result.add_key(DBDIR_KEY);
// Fetch list of database dirs and iterate them
let dbdir = self.list_dbdirs(lsn, ctx).await?;
let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf)?;
dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
for ((spcnode, dbnode), has_relmap_file) in dbs {
if has_relmap_file {
result.add_key(relmap_file_key(spcnode, dbnode));
}
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
dbs.sort_unstable();
for (spcnode, dbnode) in dbs {
result.add_key(relmap_file_key(spcnode, dbnode));
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
@@ -920,9 +919,6 @@ impl Timeline {
result.add_key(AUX_FILES_KEY);
}
// Add extra keyspaces in the test cases. Some test cases write keys into the storage without
// creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
// and the keys will not be garbage-colllected.
#[cfg(test)]
{
let guard = self.extra_test_dense_keyspace.load();
@@ -931,48 +927,13 @@ impl Timeline {
}
}
let dense_keyspace = result.to_keyspace();
let sparse_keyspace = SparseKeySpace(KeySpace {
ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
});
if cfg!(debug_assertions) {
// Verify if the sparse keyspaces are ordered and non-overlapping.
// We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
// category of sparse keys are split into their own image/delta files. If there
// are overlapping keyspaces, they will be automatically merged by keyspace accum,
// and we want the developer to keep the keyspaces separated.
let ranges = &sparse_keyspace.0.ranges;
// TODO: use a single overlaps_with across the codebase
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
for i in 0..ranges.len() {
for j in 0..i {
if overlaps_with(&ranges[i], &ranges[j]) {
panic!(
"overlapping sparse keyspace: {}..{} and {}..{}",
ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
);
}
}
}
for i in 1..ranges.len() {
assert!(
ranges[i - 1].end <= ranges[i].start,
"unordered sparse keyspace: {}..{} and {}..{}",
ranges[i - 1].start,
ranges[i - 1].end,
ranges[i].start,
ranges[i].end
);
}
}
Ok((dense_keyspace, sparse_keyspace))
Ok((
result.to_keyspace(),
/* AUX sparse key space */
SparseKeySpace(KeySpace {
ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
}),
))
}
/// Get cached size of relation if it not updated after specified LSN

View File

@@ -73,7 +73,6 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
@@ -167,7 +166,6 @@ pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
pub l0_flush_global_state: L0FlushGlobalState,
}
/// A [`Tenant`] is really an _attached_ tenant. The configuration
@@ -296,8 +294,6 @@ pub struct Tenant {
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
l0_flush_global_state: L0FlushGlobalState,
}
impl std::fmt::Debug for Tenant {
@@ -533,15 +529,6 @@ impl From<PageReconstructError> for GcError {
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
#[error("TOML deserialization error: '{0}'")]
DeserializeToml(#[from] toml_edit::de::Error),
#[error("Config not found at {0}")]
NotFound(Utf8PathBuf),
}
impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -680,7 +667,6 @@ impl Tenant {
broker_client,
remote_storage,
deletion_queue_client,
l0_flush_global_state,
} = resources;
let attach_mode = attached_conf.location.attach_mode;
@@ -695,7 +681,6 @@ impl Tenant {
tenant_shard_id,
remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
));
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -995,7 +980,6 @@ impl Tenant {
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
},
ctx,
)
@@ -1365,7 +1349,7 @@ impl Tenant {
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
@@ -1816,15 +1800,9 @@ impl Tenant {
// If we're still attaching, fire the cancellation token early to drop out: this
// will prevent us flushing, but ensures timely shutdown if some I/O during attach
// is very slow.
let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
if matches!(self.current_state(), TenantState::Attaching) {
self.cancel.cancel();
// Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
// are children of ours, so their flush loops will have shut down already
timeline::ShutdownMode::Hard
} else {
shutdown_mode
};
}
match self.set_stopping(shutdown_progress, false, false).await {
Ok(()) => {}
@@ -2491,7 +2469,6 @@ impl Tenant {
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
) -> Tenant {
debug_assert!(
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
@@ -2579,7 +2556,6 @@ impl Tenant {
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
l0_flush_global_state,
}
}
@@ -2587,35 +2563,36 @@ impl Tenant {
pub(super) fn load_tenant_config(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
) -> Result<LocationConf, LoadConfigError> {
) -> anyhow::Result<LocationConf> {
let config_path = conf.tenant_location_config_path(tenant_shard_id);
info!("loading tenant configuration from {config_path}");
if config_path.exists() {
// New-style config takes precedence
let deserialized = Self::read_config(&config_path)?;
Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
} else {
// The config should almost always exist for a tenant directory:
// - When attaching a tenant, the config is the first thing we write
// - When detaching a tenant, we atomically move the directory to a tmp location
// before deleting contents.
//
// The very rare edge case that can result in a missing config is if we crash during attach
// between creating directory and writing config. Callers should handle that as if the
// directory didn't exist.
anyhow::bail!("tenant config not found in {}", config_path);
}
}
fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
info!("loading tenant configuration from {path}");
// load and parse file
let config = fs::read_to_string(&config_path).map_err(|e| {
match e.kind() {
std::io::ErrorKind::NotFound => {
// The config should almost always exist for a tenant directory:
// - When attaching a tenant, the config is the first thing we write
// - When detaching a tenant, we atomically move the directory to a tmp location
// before deleting contents.
//
// The very rare edge case that can result in a missing config is if we crash during attach
// between creating directory and writing config. Callers should handle that as if the
// directory didn't exist.
let config = fs::read_to_string(path)
.with_context(|| format!("Failed to load config from path '{path}'"))?;
LoadConfigError::NotFound(config_path)
}
_ => {
// No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
// that we cannot cleanly recover
crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
}
}
})?;
Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
config
.parse::<toml_edit::Document>()
.with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
}
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
@@ -2623,7 +2600,7 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
location_conf: &LocationConf,
) -> std::io::Result<()> {
) -> anyhow::Result<()> {
let config_path = conf.tenant_location_config_path(tenant_shard_id);
Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
@@ -2634,7 +2611,7 @@ impl Tenant {
tenant_shard_id: &TenantShardId,
config_path: &Utf8Path,
location_conf: &LocationConf,
) -> std::io::Result<()> {
) -> anyhow::Result<()> {
debug!("persisting tenantconf to {config_path}");
let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -2643,20 +2620,22 @@ impl Tenant {
.to_string();
fail::fail_point!("tenant-config-before-write", |_| {
Err(std::io::Error::new(
std::io::ErrorKind::Other,
"tenant-config-before-write",
))
anyhow::bail!("tenant-config-before-write");
});
// Convert the config to a toml file.
conf_content +=
&toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");
conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);
let tenant_shard_id = *tenant_shard_id;
let config_path = config_path.to_owned();
let conf_content = conf_content.into_bytes();
VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
.await
.with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;
Ok(())
}
//
@@ -2874,7 +2853,6 @@ impl Tenant {
{
let mut target = timeline.gc_info.write().unwrap();
// Cull any expired leases
let now = SystemTime::now();
target.leases.retain(|_, lease| !lease.is_expired(&now));
@@ -2883,31 +2861,6 @@ impl Tenant {
.valid_lsn_lease_count_gauge
.set(target.leases.len() as u64);
// Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
}
}
// Update metrics that depend on GC state
timeline
.metrics
.archival_size
.set(if target.within_ancestor_pitr {
timeline.metrics.current_logical_size_gauge.get()
} else {
0
});
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.pitr)
.unwrap_or(Lsn(0))
.0,
);
match gc_cutoffs.remove(&timeline.timeline_id) {
Some(cutoffs) => {
target.retain_lsns = branchpoints;
@@ -2959,7 +2912,7 @@ impl Tenant {
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
delta_layer_desc: Vec<Vec<(pageserver_api::key::Key, Lsn, crate::repository::Value)>>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
@@ -3343,7 +3296,6 @@ impl Tenant {
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
}
}
@@ -3680,7 +3632,6 @@ pub(crate) mod harness {
use utils::logging;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::l0_flush::L0FlushConfig;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};
@@ -3870,8 +3821,6 @@ pub(crate) mod harness {
self.tenant_shard_id,
self.remote_storage.clone(),
self.deletion_queue.new_client(),
// TODO: ideally we should run all unit tests with both configs
L0FlushGlobalState::new(L0FlushConfig::default()),
));
let preload = tenant
@@ -3959,7 +3908,7 @@ mod tests {
use storage_layer::PersistentLayerKey;
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
use timeline::{DeltaLayerTestDesc, GcInfo};
use timeline::GcInfo;
use utils::bin_ser::BeSer;
use utils::id::TenantId;
@@ -6255,6 +6204,27 @@ mod tests {
.await
.unwrap();
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let mut res = tline
.get_vectored_impl(
KeySpace::single(key..key.next()),
lsn,
&mut reconstruct_state,
ctx,
)
.await?;
Ok(res.pop_last().map(|(k, v)| {
assert_eq!(k, key);
v.unwrap()
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
@@ -6330,6 +6300,27 @@ mod tests {
.await
.unwrap();
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let mut res = tline
.get_vectored_impl(
KeySpace::single(key..key.next()),
lsn,
&mut reconstruct_state,
ctx,
)
.await?;
Ok(res.pop_last().map(|(k, v)| {
assert_eq!(k, key);
v.unwrap()
}))
}
let lsn = Lsn(0x30);
// test vectored get on parent timeline
@@ -6405,18 +6396,9 @@ mod tests {
&ctx,
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
],
// image layers
vec![
@@ -6482,29 +6464,17 @@ mod tests {
&ctx,
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x30)..Lsn(0x40),
vec![
(key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
(key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
],
),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
vec![
(key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
(key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
],
],
// image layers
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x40),
Lsn(0x30),
)
.await
.unwrap();
@@ -6527,7 +6497,7 @@ mod tests {
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x40), &ctx)
.inspect_image_layers(Lsn(0x30), &ctx)
.await
.unwrap()
.into_iter()
@@ -6553,18 +6523,9 @@ mod tests {
&ctx,
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
],
// image layers
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
@@ -6612,21 +6573,15 @@ mod tests {
key
}
// We create
// - one bottom-most image layer,
// - a delta layer D1 crossing the GC horizon with data below and above the horizon,
// - a delta layer D2 crossing the GC horizon with data only below the horizon,
// - a delta layer D3 above the horizon.
// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
//
// | D3 |
// | D1 |
// | D1 | | D3 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expact from this compaction is:
// | D3 |
// | Part of D1 |
// | Part of D1 | | D3 |
// --------- img layer with D1+D2 at GC horizon------------------
// img layer at 0x10
@@ -6666,13 +6621,13 @@ mod tests {
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::Image(Bytes::from("value 8@0x48")),
Lsn(0x40),
Value::Image(Bytes::from("value 8@0x40")),
),
(
get_key(9),
Lsn(0x48),
Value::Image(Bytes::from("value 9@0x48")),
Lsn(0x40),
Value::Image(Bytes::from("value 9@0x40")),
),
];
@@ -6682,11 +6637,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![delta1, delta2, delta3], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
@@ -6707,8 +6658,8 @@ mod tests {
Bytes::from_static(b"value 5@0x20"),
Bytes::from_static(b"value 6@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x48"),
Bytes::from_static(b"value 9@0x48"),
Bytes::from_static(b"value 8@0x40"),
Bytes::from_static(b"value 9@0x40"),
];
for (idx, expected) in expected_result.iter().enumerate() {
@@ -6796,10 +6747,10 @@ mod tests {
lsn_range: Lsn(0x30)..Lsn(0x41),
is_delta: true
},
// The delta3 layer that should not be picked for the compaction
// The delta layer we created and should not be picked for the compaction
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
lsn_range: Lsn(0x40)..Lsn(0x41),
is_delta: true
}
]
@@ -6863,10 +6814,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x40),
delta1,
)], // delta layers
vec![delta1], // delta layers
vec![(Lsn(0x10), image1)], // image layers
Lsn(0x50),
)
@@ -6990,21 +6938,15 @@ mod tests {
key
}
// We create
// - one bottom-most image layer,
// - a delta layer D1 crossing the GC horizon with data below and above the horizon,
// - a delta layer D2 crossing the GC horizon with data only below the horizon,
// - a delta layer D3 above the horizon.
// We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
//
// | D3 |
// | D1 |
// | D1 | | D3 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expact from this compaction is:
// | D3 |
// | Part of D1 |
// | Part of D1 | | D3 |
// --------- img layer with D1+D2 at GC horizon------------------
// img layer at 0x10
@@ -7054,13 +6996,13 @@ mod tests {
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
];
@@ -7070,11 +7012,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![delta1, delta2, delta3], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
@@ -7089,7 +7027,6 @@ mod tests {
horizon: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
@@ -7102,8 +7039,8 @@ mod tests {
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
Bytes::from_static(b"value 8@0x10@0x40"),
Bytes::from_static(b"value 9@0x10@0x40"),
];
let expected_result_at_gc_horizon = [

View File

@@ -6,20 +6,13 @@
//! is written as a one byte. If it's larger than that, the length
//! is written as a four-byte integer, in big-endian, with the high
//! bit set. This way, we can detect whether it's 1- or 4-byte header
//! by peeking at the first byte. For blobs larger than 128 bits,
//! we also specify three reserved bits, only one of the three bit
//! patterns is currently in use (0b011) and signifies compression
//! with zstd.
//! by peeking at the first byte.
//!
//! len < 128: 0XXXXXXX
//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use async_compression::Level;
use bytes::{BufMut, BytesMut};
use pageserver_api::models::ImageCompressionAlgorithm;
use tokio::io::AsyncWriteExt;
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use tracing::warn;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
@@ -73,37 +66,12 @@ impl<'a> BlockCursor<'a> {
len_buf.copy_from_slice(&buf[off..off + 4]);
off += 4;
}
let bit_mask = if self.read_compressed {
!LEN_COMPRESSION_BIT_MASK
} else {
0x7f
};
len_buf[0] &= bit_mask;
len_buf[0] &= 0x7f;
u32::from_be_bytes(len_buf) as usize
};
let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
let mut tmp_buf = Vec::new();
let buf_to_write;
let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed {
if compression_bits > BYTE_UNCOMPRESSED {
warn!("reading key above future limit ({len} bytes)");
}
buf_to_write = dstbuf;
None
} else if compression_bits == BYTE_ZSTD {
buf_to_write = &mut tmp_buf;
Some(dstbuf)
} else {
let error = std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("invalid compression byte {compression_bits:x}"),
);
return Err(error);
};
buf_to_write.clear();
buf_to_write.reserve(len);
dstbuf.clear();
dstbuf.reserve(len);
// Read the payload
let mut remain = len;
@@ -117,35 +85,14 @@ impl<'a> BlockCursor<'a> {
page_remain = PAGE_SZ;
}
let this_blk_len = min(remain, page_remain);
buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]);
dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
remain -= this_blk_len;
off += this_blk_len;
}
if let Some(dstbuf) = compression {
if compression_bits == BYTE_ZSTD {
let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf);
decoder.write_all(buf_to_write).await?;
decoder.flush().await?;
} else {
unreachable!("already checked above")
}
}
Ok(())
}
}
/// Reserved bits for length and compression
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
/// The maximum size of blobs we support. The highest few bits
/// are reserved for compression and other further uses.
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
/// A wrapper of `VirtualFile` that allows users to write blobs.
///
/// If a `BlobWriter` is dropped, the internal buffer will be
@@ -180,11 +127,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
/// You need to make sure that the internal buffer is empty, otherwise
/// data will be written in wrong order.
#[inline(always)]
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
async fn write_all_unbuffered<Buf: IoBuf + Send>(
&mut self,
src_buf: B,
src_buf: Slice<Buf>,
ctx: &RequestContext,
) -> (B::Buf, Result<(), Error>) {
) -> (Slice<Buf>, Result<(), Error>) {
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
let nbytes = match res {
Ok(nbytes) => nbytes,
@@ -198,8 +145,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
/// Flushes the internal buffer to the underlying `VirtualFile`.
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
let buf = std::mem::take(&mut self.buf);
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
let (buf, res) = self.inner.write_all(buf.slice_full(), ctx).await;
res?;
let mut buf = Slice::into_inner(buf);
buf.clear();
self.buf = buf;
Ok(())
@@ -216,11 +164,20 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
}
/// Internal, possibly buffered, write function
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
async fn write_all<Buf: IoBuf + Send>(
&mut self,
src_buf: B,
src_buf: Slice<Buf>,
ctx: &RequestContext,
) -> (B::Buf, Result<(), Error>) {
) -> (Slice<Buf>, Result<(), Error>) {
let orig_bounds = src_buf.bounds();
macro_rules! return_orig_bounds {
($buf:expr, $val:expr) => {{
let buf = $buf.into_inner();
return (buf.slice(orig_bounds), $val);
}};
}
if !BUFFERED {
assert!(self.buf.is_empty());
return self.write_all_unbuffered(src_buf, ctx).await;
@@ -228,7 +185,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
let remaining = Self::CAPACITY - self.buf.len();
let src_buf_len = src_buf.bytes_init();
if src_buf_len == 0 {
return (Slice::into_inner(src_buf.slice_full()), Ok(()));
return (src_buf, Ok(()));
}
let mut src_buf = src_buf.slice(0..src_buf_len);
// First try to copy as much as we can into the buffer
@@ -239,7 +196,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
// Then, if the buffer is full, flush it out
if self.buf.len() == Self::CAPACITY {
if let Err(e) = self.flush_buffer(ctx).await {
return (Slice::into_inner(src_buf), Err(e));
return_orig_bounds!(src_buf, Err(e));
}
}
// Finally, write the tail of src_buf:
@@ -252,118 +209,65 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
let copied = self.write_into_buffer(&src_buf);
// We just verified above that src_buf fits into our internal buffer.
assert_eq!(copied, src_buf.len());
Slice::into_inner(src_buf)
src_buf
} else {
let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
if let Err(e) = res {
return (src_buf, Err(e));
return_orig_bounds!(src_buf, Err(e));
}
src_buf
}
} else {
Slice::into_inner(src_buf)
src_buf
};
(src_buf, Ok(()))
return_orig_bounds!(src_buf, Ok(()));
}
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
pub async fn write_blob<Buf: IoBuf + Send>(
&mut self,
srcbuf: B,
srcbuf: Slice<Buf>,
ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
self.write_blob_maybe_compressed(
srcbuf,
ctx,
ImageCompressionAlgorithm::DisabledNoDecompress,
)
.await
}
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
srcbuf: B,
ctx: &RequestContext,
algorithm: ImageCompressionAlgorithm,
) -> (B::Buf, Result<u64, Error>) {
) -> (Slice<Buf>, Result<u64, Error>) {
let offset = self.offset;
let len = srcbuf.bytes_init();
let mut io_buf = self.io_buf.take().expect("we always put it back below");
io_buf.clear();
let mut compressed_buf = None;
let ((io_buf, hdr_res), srcbuf) = async {
let (io_buf, hdr_res) = async {
if len < 128 {
// Short blob. Write a 1-byte length header
io_buf.put_u8(len as u8);
(
self.write_all(io_buf, ctx).await,
srcbuf.slice_full().into_inner(),
)
self.write_all(io_buf.slice_full(), ctx).await
} else {
// Write a 4-byte length header
if len > MAX_SUPPORTED_LEN {
if len > 0x7fff_ffff {
return (
(
io_buf,
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
)),
),
srcbuf.slice_full().into_inner(),
io_buf.slice_full(),
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
)),
);
}
let (high_bit_mask, len_written, srcbuf) = match algorithm {
ImageCompressionAlgorithm::Zstd { level } => {
let mut encoder = if let Some(level) = level {
async_compression::tokio::write::ZstdEncoder::with_quality(
Vec::new(),
Level::Precise(level.into()),
)
} else {
async_compression::tokio::write::ZstdEncoder::new(Vec::new())
};
let slice = srcbuf.slice_full();
encoder.write_all(&slice[..]).await.unwrap();
encoder.shutdown().await.unwrap();
let compressed = encoder.into_inner();
if compressed.len() < len {
let compressed_len = compressed.len();
compressed_buf = Some(compressed);
(BYTE_ZSTD, compressed_len, slice.into_inner())
} else {
(BYTE_UNCOMPRESSED, len, slice.into_inner())
}
}
ImageCompressionAlgorithm::Disabled
| ImageCompressionAlgorithm::DisabledNoDecompress => {
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
}
};
let mut len_buf = (len_written as u32).to_be_bytes();
assert_eq!(len_buf[0] & 0xf0, 0);
len_buf[0] |= high_bit_mask;
if len > 0x0fff_ffff {
tracing::warn!("writing blob above future limit ({len} bytes)");
}
let mut len_buf = (len as u32).to_be_bytes();
len_buf[0] |= 0x80;
io_buf.extend_from_slice(&len_buf[..]);
(self.write_all(io_buf, ctx).await, srcbuf)
self.write_all(io_buf.slice_full(), ctx).await
}
}
.await;
self.io_buf = Some(io_buf);
self.io_buf = Some(io_buf.into_inner());
match hdr_res {
Ok(_) => (),
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
Err(e) => return (srcbuf, Err(e)),
}
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
let (_buf, res) = self.write_all(compressed_buf, ctx).await;
(Slice::into_inner(srcbuf.slice(..)), res)
} else {
self.write_all(srcbuf, ctx).await
};
let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
(srcbuf, res.map(|_| offset))
}
}
@@ -401,13 +305,6 @@ mod tests {
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
round_trip_test_compressed::<BUFFERED>(blobs, false).await
}
async fn round_trip_test_compressed<const BUFFERED: bool>(
blobs: &[Vec<u8>],
compression: bool,
) -> Result<(), Error> {
let temp_dir = camino_tempfile::tempdir()?;
let pathbuf = temp_dir.path().join("file");
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
@@ -418,22 +315,13 @@ mod tests {
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
wtr.write_blob_maybe_compressed(
blob.clone(),
&ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await
} else {
wtr.write_blob(blob.clone(), &ctx).await
};
let (_, res) = wtr.write_blob(blob.clone().slice_full(), &ctx).await;
let offs = res?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_full(), &ctx).await;
let offs = res?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer(&ctx).await?;
@@ -441,7 +329,7 @@ mod tests {
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new_with_compression(rdr, compression);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
let blob_read = rdr.read_blob(*offset, &ctx).await?;
assert_eq!(
@@ -475,8 +363,6 @@ mod tests {
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
round_trip_test_compressed::<false>(blobs, true).await?;
round_trip_test_compressed::<true>(blobs, true).await?;
Ok(())
}
@@ -485,15 +371,10 @@ mod tests {
let blobs = &[
b"test".to_vec(),
random_array(10 * PAGE_SZ),
b"hello".to_vec(),
random_array(66 * PAGE_SZ),
vec![0xf3; 24 * PAGE_SZ],
b"foobar".to_vec(),
];
round_trip_test::<false>(blobs).await?;
round_trip_test::<true>(blobs).await?;
round_trip_test_compressed::<false>(blobs, true).await?;
round_trip_test_compressed::<true>(blobs, true).await?;
Ok(())
}

View File

@@ -37,7 +37,6 @@ where
pub enum BlockLease<'a> {
PageReadGuard(PageReadGuard<'static>),
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
Slice(&'a [u8; PAGE_SZ]),
#[cfg(test)]
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
#[cfg(test)]
@@ -64,7 +63,6 @@ impl<'a> Deref for BlockLease<'a> {
match self {
BlockLease::PageReadGuard(v) => v.deref(),
BlockLease::EphemeralFileMutableTail(v) => v,
BlockLease::Slice(v) => v,
#[cfg(test)]
BlockLease::Arc(v) => v.deref(),
#[cfg(test)]
@@ -83,7 +81,6 @@ pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader<'a>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
Slice(&'a [u8]),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
@@ -102,7 +99,6 @@ impl<'a> BlockReaderRef<'a> {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
Slice(s) => Self::read_blk_slice(s, blknum),
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
@@ -111,24 +107,6 @@ impl<'a> BlockReaderRef<'a> {
}
}
impl<'a> BlockReaderRef<'a> {
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
let end = start.checked_add(PAGE_SZ).unwrap();
if end > slice.len() {
return Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
format!("slice too short, len={} end={}", slice.len(), end),
));
}
let slice = &slice[start..end];
let page_sized: &[u8; PAGE_SZ] = slice
.try_into()
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
Ok(BlockLease::Slice(page_sized))
}
}
///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///
@@ -149,24 +127,16 @@ impl<'a> BlockReaderRef<'a> {
/// ```
///
pub struct BlockCursor<'a> {
pub(super) read_compressed: bool,
reader: BlockReaderRef<'a>,
}
impl<'a> BlockCursor<'a> {
pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
Self::new_with_compression(reader, false)
}
pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self {
BlockCursor {
read_compressed,
reader,
}
BlockCursor { reader }
}
// Needed by cli
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
BlockCursor {
read_compressed: false,
reader: BlockReaderRef::FileBlockReader(reader),
}
}
@@ -196,25 +166,11 @@ pub struct FileBlockReader<'a> {
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
compressed_reads: bool,
}
impl<'a> FileBlockReader<'a> {
pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
Self::new_with_compression(file, file_id, false)
}
pub fn new_with_compression(
file: &'a VirtualFile,
file_id: FileId,
compressed_reads: bool,
) -> Self {
FileBlockReader {
file_id,
file,
compressed_reads,
}
FileBlockReader { file_id, file }
}
/// Read a page from the underlying file into given buffer.
@@ -261,10 +217,7 @@ impl<'a> FileBlockReader<'a> {
impl BlockReader for FileBlockReader<'_> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new_with_compression(
BlockReaderRef::FileBlockReader(self),
self.compressed_reads,
)
BlockCursor::new(BlockReaderRef::FileBlockReader(self))
}
}

View File

@@ -21,7 +21,6 @@ pub struct EphemeralFile {
}
mod page_caching;
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
mod zero_padded_read_write;
impl EphemeralFile {
@@ -54,7 +53,7 @@ impl EphemeralFile {
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()),
rw: page_caching::RW::new(file),
})
}
@@ -66,11 +65,6 @@ impl EphemeralFile {
self.rw.page_cache_file_id()
}
/// See [`self::page_caching::RW::load_to_vec`].
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
self.rw.load_to_vec(ctx).await
}
pub(crate) async fn read_blk(
&self,
blknum: u32,

View File

@@ -8,7 +8,6 @@ use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::io::{self, ErrorKind};
use std::ops::{Deref, Range};
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
@@ -20,23 +19,14 @@ pub struct RW {
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
}
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
/// should we pre-warm the [`crate::page_cache`] with the contents?
#[derive(Clone, Copy)]
pub enum PrewarmOnWrite {
Yes,
No,
}
impl RW {
pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self {
pub fn new(file: VirtualFile) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
page_cache_file_id,
file,
prewarm_on_write,
)),
}
}
@@ -59,43 +49,6 @@ impl RW {
self.rw.bytes_written()
}
/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
///
/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
// round up to the next PAGE_SZ multiple, required by blob_io
let size = {
let s = usize::try_from(self.bytes_written()).unwrap();
if s % PAGE_SZ == 0 {
s
} else {
s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
}
};
let vec = Vec::with_capacity(size);
// read from disk what we've already flushed
let writer = self.rw.as_writer();
let flushed_range = writer.written_range();
let mut vec = writer
.file
.read_exact_at(
vec.slice(0..(flushed_range.end - flushed_range.start)),
u64::try_from(flushed_range.start).unwrap(),
ctx,
)
.await?
.into_inner();
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
let buffered = self.rw.get_tail_zero_padded();
vec.extend_from_slice(buffered);
assert_eq!(vec.len(), size);
assert_eq!(vec.len() % PAGE_SZ, 0);
Ok(vec)
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
@@ -163,40 +116,19 @@ impl Drop for RW {
}
struct PreWarmingWriter {
prewarm_on_write: PrewarmOnWrite,
nwritten_blocks: u32,
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
}
impl PreWarmingWriter {
fn new(
page_cache_file_id: page_cache::FileId,
file: VirtualFile,
prewarm_on_write: PrewarmOnWrite,
) -> Self {
fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
Self {
prewarm_on_write,
nwritten_blocks: 0,
page_cache_file_id,
file,
}
}
/// Return the byte range within `file` that has been written though `write_all`.
///
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
struct Wrapper(Range<usize>);
impl Deref for Wrapper {
type Target = Range<usize>;
fn deref(&self) -> &Range<usize> {
&self.0
}
}
Wrapper(0..nwritten_blocks * PAGE_SZ)
}
}
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
@@ -226,7 +158,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
let iobuf = match self.file.write_all(buf, ctx).await {
(iobuf, Ok(nwritten)) => {
assert_eq!(nwritten, buflen);
iobuf
iobuf.into_inner()
}
(_, Err(e)) => {
return Err(std::io::Error::new(
@@ -246,51 +178,45 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi
assert_eq!(&check_bounds_stuff_works, &*buf);
}
// Pre-warm page cache with the contents.
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
// benefits the code that writes InMemoryLayer=>L0 layers.
let nblocks = buflen / PAGE_SZ;
let nblocks32 = u32::try_from(nblocks).unwrap();
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
// Pre-warm page cache with the contents.
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
// benefits the code that writes InMemoryLayer=>L0 layers.
let cache = page_cache::get();
static CTX: Lazy<RequestContext> = Lazy::new(|| {
RequestContext::new(
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
crate::context::DownloadBehavior::Error,
)
});
for blknum_in_buffer in 0..nblocks {
let blk_in_buffer =
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
let blknum = self
.nwritten_blocks
.checked_add(blknum_in_buffer as u32)
.unwrap();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
.await
{
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
Ok(v) => match v {
page_cache::ReadBufResult::Found(_guard) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
and this function takes &mut self, so, no concurrent read_blk is possible");
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(blk_in_buffer);
let _ = write_guard.mark_valid();
}
},
let cache = page_cache::get();
static CTX: Lazy<RequestContext> = Lazy::new(|| {
RequestContext::new(
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
crate::context::DownloadBehavior::Error,
)
});
for blknum_in_buffer in 0..nblocks {
let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
let blknum = self
.nwritten_blocks
.checked_add(blknum_in_buffer as u32)
.unwrap();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
.await
{
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
Ok(v) => match v {
page_cache::ReadBufResult::Found(_guard) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
and this function takes &mut self, so, no concurrent read_blk is possible");
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(blk_in_buffer);
let _ = write_guard.mark_valid();
}
},
}
}
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
Ok((buflen, buf.into_inner()))
}

View File

@@ -75,21 +75,6 @@ where
flushed_offset + u64::try_from(buffer.pending()).unwrap()
}
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
pub fn get_tail_zero_padded(&self) -> &[u8] {
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffer_written_up_to = buffer.pending();
// pad to next page boundary
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
buffer_written_up_to
} else {
buffer_written_up_to
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
.unwrap()
};
&buffer.as_zero_padded_slice()[0..read_up_to]
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();

View File

@@ -43,8 +43,7 @@ use crate::tenant::config::{
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
use crate::virtual_file::MaybeFatalIo;
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
@@ -273,7 +272,7 @@ pub struct TenantManager {
}
fn emergency_generations(
tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantShardId, TenantStartupMode> {
tenant_confs
.iter()
@@ -297,7 +296,7 @@ fn emergency_generations(
async fn init_load_generations(
conf: &'static PageServerConf,
tenant_confs: &HashMap<TenantShardId, Result<LocationConf, LoadConfigError>>,
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
resources: &TenantSharedResources,
cancel: &CancellationToken,
) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
@@ -347,32 +346,56 @@ async fn init_load_generations(
/// Given a directory discovered in the pageserver's tenants/ directory, attempt
/// to load a tenant config from it.
///
/// If we cleaned up something expected (like an empty dir or a temp dir), return None.
/// If file is missing, return Ok(None)
fn load_tenant_config(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
dentry: Utf8DirEntry,
) -> Option<Result<LocationConf, LoadConfigError>> {
) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
let tenant_dir_path = dentry.path().to_path_buf();
if crate::is_temporary(&tenant_dir_path) {
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
// No need to use safe_remove_tenant_dir_all because this is already
// a temporary path
std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir");
return None;
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
error!(
"Failed to remove temporary directory '{}': {:?}",
tenant_dir_path, e
);
}
return Ok(None);
}
// This case happens if we crash during attachment before writing a config into the dir
let is_empty = tenant_dir_path
.is_empty_dir()
.fatal_err("Checking for empty tenant dir");
.with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir");
return None;
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path
)
}
return Ok(None);
}
Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
let tenant_shard_id = match tenant_dir_path
.file_name()
.unwrap_or_default()
.parse::<TenantShardId>()
{
Ok(id) => id,
Err(_) => {
warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
return Ok(None);
}
};
Ok(Some((
tenant_shard_id,
Tenant::load_tenant_config(conf, &tenant_shard_id),
)))
}
/// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -382,51 +405,32 @@ fn load_tenant_config(
/// seconds even on reasonably fast drives.
async fn init_load_tenant_configs(
conf: &'static PageServerConf,
) -> HashMap<TenantShardId, Result<LocationConf, LoadConfigError>> {
) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
let tenants_dir = conf.tenants_path();
let dentries = tokio::task::spawn_blocking(move || -> Vec<Utf8DirEntry> {
let context = format!("read tenants dir {tenants_dir}");
let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context);
let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
let dir_entries = tenants_dir
.read_dir_utf8()
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
dir_entries
.collect::<Result<Vec<_>, std::io::Error>>()
.fatal_err(&context)
Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
})
.await
.expect("Config load task panicked");
.await??;
let mut configs = HashMap::new();
let mut join_set = JoinSet::new();
for dentry in dentries {
let tenant_shard_id = match dentry.file_name().parse::<TenantShardId>() {
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): '{}'",
dentry.file_name()
);
continue;
}
};
join_set.spawn_blocking(move || {
(
tenant_shard_id,
load_tenant_config(conf, tenant_shard_id, dentry),
)
});
join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
}
while let Some(r) = join_set.join_next().await {
let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task");
if let Some(tenant_config) = tenant_config {
configs.insert(tenant_shard_id, tenant_config);
if let Some((tenant_id, tenant_config)) = r?? {
configs.insert(tenant_id, tenant_config);
}
}
configs
Ok(configs)
}
#[derive(Debug, thiserror::Error)]
@@ -468,7 +472,7 @@ pub async fn init_tenant_mgr(
);
// Scan local filesystem for attached tenants
let tenant_configs = init_load_tenant_configs(conf).await;
let tenant_configs = init_load_tenant_configs(conf).await?;
// Determine which tenants are to be secondary or attached, and in which generation
let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
@@ -586,23 +590,31 @@ pub async fn init_tenant_mgr(
);
// For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
// Writing a config to local disk is foundational to startup up tenants: panic if we can't.
config_write_result.fatal_err("write tenant shard config file");
// Errors writing configs are fatal
config_write_result?;
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
let shard_identity = location_conf.shard;
let slot = match location_conf.mode {
LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
&ctx,
)),
LocationMode::Attached(attached_conf) => {
match tenant_spawn(
conf,
tenant_shard_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
shard_identity,
Some(init_order.clone()),
SpawnMode::Lazy,
&ctx,
) {
Ok(tenant) => TenantSlot::Attached(tenant),
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
continue;
}
}
}
LocationMode::Secondary(secondary_conf) => {
info!(
tenant_id = %tenant_shard_id.tenant_id,
@@ -637,7 +649,8 @@ pub async fn init_tenant_mgr(
})
}
/// Wrapper for Tenant::spawn that checks invariants before running
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
/// a broken tenant in the map if Tenant::spawn fails.
#[allow(clippy::too_many_arguments)]
fn tenant_spawn(
conf: &'static PageServerConf,
@@ -649,18 +662,23 @@ fn tenant_spawn(
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> Arc<Tenant> {
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
// to avoid impacting prod runtime performance.
assert!(!crate::is_temporary(tenant_path));
debug_assert!(tenant_path.is_dir());
debug_assert!(conf
.tenant_location_config_path(&tenant_shard_id)
.try_exists()
.unwrap());
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
tenant_path.is_dir(),
"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
);
anyhow::ensure!(
!crate::is_temporary(tenant_path),
"Cannot load tenant from temporary path {tenant_path:?}"
);
anyhow::ensure!(
!tenant_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_path:?} is an empty dir")
})?,
"Cannot load tenant from empty directory {tenant_path:?}"
);
Tenant::spawn(
let tenant = Tenant::spawn(
conf,
tenant_shard_id,
resources,
@@ -669,7 +687,9 @@ fn tenant_spawn(
init_order,
mode,
ctx,
)
);
Ok(tenant)
}
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
@@ -820,9 +840,8 @@ pub(crate) enum UpsertLocationError {
#[error("Failed to flush: {0}")]
Flush(anyhow::Error),
/// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state.
#[error("Internal error: {0}")]
InternalError(anyhow::Error),
Other(#[from] anyhow::Error),
}
impl TenantManager {
@@ -952,8 +971,7 @@ impl TenantManager {
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.fatal_err("write tenant shard config");
.await?;
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
@@ -983,8 +1001,7 @@ impl TenantManager {
}
Some(FastPathModified::Secondary(_secondary_tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.fatal_err("write tenant shard config");
.await?;
return Ok(None);
}
@@ -1050,7 +1067,7 @@ impl TenantManager {
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
// if the contents of a slot were InProgress.
return Err(UpsertLocationError::InternalError(anyhow::anyhow!(
return Err(UpsertLocationError::Other(anyhow::anyhow!(
"Acquired an InProgress slot, this is a bug."
)));
}
@@ -1069,14 +1086,12 @@ impl TenantManager {
// Does not need to be fsync'd because local storage is just a cache.
tokio::fs::create_dir_all(&timelines_path)
.await
.fatal_err("create timelines/ dir");
.with_context(|| format!("Creating {timelines_path}"))?;
// Before activating either secondary or attached mode, persist the
// configuration, so that on restart we will re-attach (or re-start
// secondary) on the tenant.
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.fatal_err("write tenant shard config");
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(secondary_config) => {
@@ -1095,15 +1110,13 @@ impl TenantManager {
// from upserts. This enables creating generation-less tenants even though neon_local
// always uses generations when calling the location conf API.
let attached_conf = if cfg!(feature = "testing") {
let mut conf = AttachedTenantConf::try_from(new_location_config)
.map_err(UpsertLocationError::BadRequest)?;
let mut conf = AttachedTenantConf::try_from(new_location_config)?;
if self.conf.control_plane_api.is_none() {
conf.location.generation = Generation::none();
}
conf
} else {
AttachedTenantConf::try_from(new_location_config)
.map_err(UpsertLocationError::BadRequest)?
AttachedTenantConf::try_from(new_location_config)?
};
let tenant = tenant_spawn(
@@ -1116,7 +1129,7 @@ impl TenantManager {
None,
spawn_mode,
ctx,
);
)?;
TenantSlot::Attached(tenant)
}
@@ -1130,7 +1143,7 @@ impl TenantManager {
match slot_guard.upsert(new_slot) {
Err(TenantSlotUpsertError::InternalError(e)) => {
Err(UpsertLocationError::InternalError(anyhow::anyhow!(e)))
Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
}
Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
@@ -1237,7 +1250,7 @@ impl TenantManager {
None,
SpawnMode::Eager,
ctx,
);
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1971,7 +1984,7 @@ impl TenantManager {
None,
SpawnMode::Eager,
ctx,
);
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;

View File

@@ -519,7 +519,7 @@ impl RemoteTimelineClient {
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
) -> anyhow::Result<u64> {
let downloaded_size = {
let _unfinished_gauge_guard = self.metrics.call_begin(
&RemoteOpFileKind::Layer,

View File

@@ -23,8 +23,6 @@ use super::{
storage_layer::LayerName,
};
use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
use metrics::UIntGauge;
use pageserver_api::{
models,
shard::{ShardIdentity, TenantShardId},
@@ -101,17 +99,6 @@ pub(crate) struct SecondaryTenant {
// Public state indicating overall progress of downloads relative to the last heatmap seen
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
// Sum of layer sizes on local disk
pub(super) resident_size_metric: UIntGauge,
}
impl Drop for SecondaryTenant {
fn drop(&mut self) {
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
}
}
impl SecondaryTenant {
@@ -121,12 +108,6 @@ impl SecondaryTenant {
tenant_conf: TenantConfOpt,
config: &SecondaryLocationConfig,
) -> Arc<Self> {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id])
.unwrap();
Arc::new(Self {
tenant_shard_id,
// todo: shall we make this a descendent of the
@@ -142,8 +123,6 @@ impl SecondaryTenant {
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
progress: std::sync::Mutex::default(),
resident_size_metric,
})
}
@@ -232,12 +211,16 @@ impl SecondaryTenant {
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(removed) =
detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
{
// We might race with removal of the same layer during downloads, so finding the layer we
// were trying to remove is optional. Only issue the disk I/O to remove it if we found it.
removed.remove_blocking();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
let removed = timeline_detail.on_disk_layers.remove(&name);
// We might race with removal of the same layer during downloads, if it was removed
// from the heatmap. If we see that the OnDiskState is gone, then no need to
// do a physical deletion or store in evicted_at.
if let Some(removed) = removed {
removed.remove_blocking();
timeline_detail.evicted_at.insert(name, now);
}
}
})
.await

View File

@@ -46,7 +46,6 @@ use crate::tenant::{
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use metrics::UIntGauge;
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
@@ -132,66 +131,16 @@ impl OnDiskState {
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting secondary layer")
}
pub(crate) fn file_size(&self) -> u64 {
self.metadata.file_size
}
}
#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
on_disk_layers: HashMap<LayerName, OnDiskState>,
pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
/// We remember when layers were evicted, to prevent re-downloading them.
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
}
impl SecondaryDetailTimeline {
pub(super) fn remove_layer(
&mut self,
name: &LayerName,
resident_metric: &UIntGauge,
) -> Option<OnDiskState> {
let removed = self.on_disk_layers.remove(name);
if let Some(removed) = &removed {
resident_metric.sub(removed.file_size());
}
removed
}
/// `local_path`
fn touch_layer<F>(
&mut self,
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
touched: &HeatMapLayer,
resident_metric: &UIntGauge,
local_path: F,
) where
F: FnOnce() -> Utf8PathBuf,
{
use std::collections::hash_map::Entry;
match self.on_disk_layers.entry(touched.name.clone()) {
Entry::Occupied(mut v) => {
v.get_mut().access_time = touched.access_time;
}
Entry::Vacant(e) => {
e.insert(OnDiskState::new(
conf,
tenant_shard_id,
timeline_id,
touched.name.clone(),
touched.metadata.clone(),
touched.access_time,
local_path(),
));
resident_metric.add(touched.metadata.file_size);
}
}
}
}
// Aspects of a heatmap that we remember after downloading it
#[derive(Clone, Debug)]
struct DownloadSummary {
@@ -209,7 +158,7 @@ pub(super) struct SecondaryDetail {
last_download: Option<DownloadSummary>,
next_download: Option<Instant>,
timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
/// Helper for logging SystemTime
@@ -242,38 +191,6 @@ impl SecondaryDetail {
}
}
pub(super) fn evict_layer(
&mut self,
name: LayerName,
timeline_id: &TimelineId,
now: SystemTime,
resident_metric: &UIntGauge,
) -> Option<OnDiskState> {
let timeline = self.timelines.get_mut(timeline_id)?;
let removed = timeline.remove_layer(&name, resident_metric);
if removed.is_some() {
timeline.evicted_at.insert(name, now);
}
removed
}
pub(super) fn remove_timeline(
&mut self,
timeline_id: &TimelineId,
resident_metric: &UIntGauge,
) {
let removed = self.timelines.remove(timeline_id);
if let Some(removed) = removed {
resident_metric.sub(
removed
.on_disk_layers
.values()
.map(|l| l.metadata.file_size)
.sum(),
);
}
}
/// Additionally returns the total number of layers, used for more stable relative access time
/// based eviction.
pub(super) fn get_layers_for_eviction(
@@ -684,13 +601,8 @@ impl<'a> TenantDownloader<'a> {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state = init_timeline_state(
self.conf,
tenant_shard_id,
timeline,
&self.secondary_state.resident_size_metric,
)
.await;
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
@@ -759,25 +671,6 @@ impl<'a> TenantDownloader<'a> {
.await?;
}
// Metrics consistency check in testing builds
if cfg!(feature = "testing") {
let detail = self.secondary_state.detail.lock().unwrap();
let resident_size = detail
.timelines
.values()
.map(|tl| {
tl.on_disk_layers
.values()
.map(|v| v.metadata.file_size)
.sum::<u64>()
})
.sum::<u64>();
assert_eq!(
resident_size,
self.secondary_state.resident_size_metric.get()
);
}
// Only update last_etag after a full successful download: this way will not skip
// the next download, even if the heatmap's actual etag is unchanged.
self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
@@ -890,7 +783,7 @@ impl<'a> TenantDownloader<'a> {
for delete_timeline in &delete_timelines {
// We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
// from disk fails that will be a fatal error.
detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
detail.timelines.remove(delete_timeline);
}
}
@@ -908,7 +801,7 @@ impl<'a> TenantDownloader<'a> {
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
continue;
};
timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
timeline_state.on_disk_layers.remove(&layer_name);
}
for timeline_id in delete_timelines {
@@ -1107,24 +1000,33 @@ impl<'a> TenantDownloader<'a> {
let timeline_detail = detail.timelines.entry(timeline_id).or_default();
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
touched.into_iter().for_each(|t| {
timeline_detail.touch_layer(
self.conf,
tenant_shard_id,
&timeline_id,
&t,
&self.secondary_state.resident_size_metric,
|| {
local_layer_path(
for t in touched {
use std::collections::hash_map::Entry;
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
Entry::Occupied(mut v) => {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline_id,
&t.name,
&t.metadata.generation,
)
},
)
});
);
e.insert(OnDiskState::new(
self.conf,
tenant_shard_id,
&timeline_id,
t.name,
t.metadata.clone(),
t.access_time,
local_path,
));
}
}
}
}
result
@@ -1233,7 +1135,6 @@ async fn init_timeline_state(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
heatmap: &HeatMapTimeline,
resident_metric: &UIntGauge,
) -> SecondaryDetailTimeline {
let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
let mut detail = SecondaryDetailTimeline::default();
@@ -1309,13 +1210,17 @@ async fn init_timeline_state(
} else {
// We expect the access time to be initialized immediately afterwards, when
// the latest heatmap is applied to the state.
detail.touch_layer(
conf,
tenant_shard_id,
&heatmap.timeline_id,
remote_meta,
resident_metric,
|| file_path,
detail.on_disk_layers.insert(
name.clone(),
OnDiskState::new(
conf,
tenant_shard_id,
&heatmap.timeline_id,
name,
remote_meta.metadata.clone(),
remote_meta.access_time,
file_path,
),
);
}
}

View File

@@ -3,7 +3,6 @@ use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use tenant_size_model::svg::SvgBranchKind;
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
@@ -88,9 +87,6 @@ impl SegmentMeta {
LsnKind::BranchPoint => true,
LsnKind::GcCutOff => true,
LsnKind::BranchEnd => false,
LsnKind::LeasePoint => true,
LsnKind::LeaseStart => false,
LsnKind::LeaseEnd => false,
}
}
}
@@ -107,21 +103,6 @@ pub enum LsnKind {
GcCutOff,
/// Last record LSN
BranchEnd,
/// A LSN lease is granted here.
LeasePoint,
/// A lease starts from here.
LeaseStart,
/// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]).
LeaseEnd,
}
impl From<LsnKind> for SvgBranchKind {
fn from(kind: LsnKind) -> Self {
match kind {
LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease,
_ => SvgBranchKind::Timeline,
}
}
}
/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
@@ -143,9 +124,6 @@ pub struct TimelineInputs {
/// Cutoff point calculated from the user-supplied 'max_retention_period'
retention_param_cutoff: Option<Lsn>,
/// Lease points on the timeline
lease_points: Vec<Lsn>,
}
/// Gathers the inputs for the tenant sizing model.
@@ -256,13 +234,6 @@ pub(super) async fn gather_inputs(
None
};
let lease_points = gc_info
.leases
.keys()
.filter(|&&lsn| lsn > ancestor_lsn)
.copied()
.collect::<Vec<_>>();
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -277,8 +248,6 @@ pub(super) async fn gather_inputs(
.map(|lsn| (lsn, LsnKind::BranchPoint))
.collect::<Vec<_>>();
lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
drop(gc_info);
// Add branch points we collected earlier, just in case there were any that were
@@ -327,7 +296,6 @@ pub(super) async fn gather_inputs(
if kind == LsnKind::BranchPoint {
branchpoint_segments.insert((timeline_id, lsn), segments.len());
}
segments.push(SegmentMeta {
segment: Segment {
parent: Some(parent),
@@ -338,45 +306,7 @@ pub(super) async fn gather_inputs(
timeline_id: timeline.timeline_id,
kind,
});
parent = segments.len() - 1;
if kind == LsnKind::LeasePoint {
// Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data
// (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
// value. Without the other two segments, the calculation code would not count the leased LSN as a point
// to be retained.
// Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug.
//
// Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
// branch points can be given a synthetic id so we can unite them.
let mut lease_parent = parent;
// Start of a lease.
segments.push(SegmentMeta {
segment: Segment {
parent: Some(lease_parent),
lsn: lsn.0,
size: None, // Filled in later, if necessary
needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
},
timeline_id: timeline.timeline_id,
kind: LsnKind::LeaseStart,
});
lease_parent += 1;
// End of the lease.
segments.push(SegmentMeta {
segment: Segment {
parent: Some(lease_parent),
lsn: lsn.0,
size: None, // Filled in later, if necessary
needed: true, // everything at the lease LSN must be readable => is needed
},
timeline_id: timeline.timeline_id,
kind: LsnKind::LeaseEnd,
});
}
parent += 1;
}
// Current end of the timeline
@@ -402,7 +332,6 @@ pub(super) async fn gather_inputs(
pitr_cutoff,
next_gc_cutoff,
retention_param_cutoff,
lease_points,
});
}
@@ -745,8 +674,7 @@ fn verify_size_for_multiple_branches() {
"horizon_cutoff": "0/2210CD0",
"pitr_cutoff": "0/2210CD0",
"next_gc_cutoff": "0/2210CD0",
"retention_param_cutoff": null,
"lease_points": []
"retention_param_cutoff": null
},
{
"timeline_id": "454626700469f0a9914949b9d018e876",
@@ -756,8 +684,7 @@ fn verify_size_for_multiple_branches() {
"horizon_cutoff": "0/1817770",
"pitr_cutoff": "0/1817770",
"next_gc_cutoff": "0/1817770",
"retention_param_cutoff": null,
"lease_points": []
"retention_param_cutoff": null
},
{
"timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
@@ -767,8 +694,7 @@ fn verify_size_for_multiple_branches() {
"horizon_cutoff": "0/18B3D98",
"pitr_cutoff": "0/18B3D98",
"next_gc_cutoff": "0/18B3D98",
"retention_param_cutoff": null,
"lease_points": []
"retention_param_cutoff": null
}
]
}
@@ -823,8 +749,7 @@ fn verify_size_for_one_branch() {
"horizon_cutoff": "47/240A5860",
"pitr_cutoff": "47/240A5860",
"next_gc_cutoff": "47/240A5860",
"retention_param_cutoff": "0/0",
"lease_points": []
"retention_param_cutoff": "0/0"
}
]
}"#;

View File

@@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -60,6 +60,7 @@ use std::os::unix::fs::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio_epoll_uring::{BoundedBuf as _, Slice};
use tracing::*;
use utils::{
@@ -452,12 +453,8 @@ impl DeltaLayerWriterInner {
ctx: &RequestContext,
) -> (Vec<u8>, anyhow::Result<()>) {
assert!(self.lsn_range.start <= lsn);
// We don't want to use compression in delta layer creation
let compression = ImageCompressionAlgorithm::DisabledNoDecompress;
let (val, res) = self
.blob_writer
.write_blob_maybe_compressed(val, ctx, compression)
.await;
let (val, res) = self.blob_writer.write_blob(val.slice_full(), ctx).await;
let val = Slice::into_inner(val);
let off = match res {
Ok(off) => off,
Err(e) => return (val, Err(anyhow::anyhow!(e))),
@@ -510,7 +507,7 @@ impl DeltaLayerWriterInner {
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
.await?;
for buf in block_buf.blocks {
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf.slice_full(), ctx).await;
res?;
}
assert!(self.lsn_range.start < self.lsn_range.end);
@@ -530,7 +527,7 @@ impl DeltaLayerWriterInner {
// TODO: could use smallvec here but it's a pain with Slice<T>
Summary::ser_into(&summary, &mut buf)?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf.slice_full(), ctx).await;
res?;
let metadata = file
@@ -735,7 +732,7 @@ impl DeltaLayer {
// TODO: could use smallvec here, but it's a pain with Slice<T>
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf.slice_full(), ctx).await;
res?;
Ok(())
}

View File

@@ -165,7 +165,6 @@ pub struct ImageLayerInner {
file_id: FileId,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
compressed_reads: bool,
}
impl std::fmt::Debug for ImageLayerInner {
@@ -179,8 +178,7 @@ impl std::fmt::Debug for ImageLayerInner {
impl ImageLayerInner {
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
@@ -268,10 +266,9 @@ impl ImageLayer {
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
let path = self.path();
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx)
.await
.and_then(|res| res)?;
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
.await
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
@@ -344,6 +341,7 @@ impl ImageLayer {
where
F: Fn(Summary) -> Summary,
{
use tokio_epoll_uring::BoundedBuf as _;
let mut file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
@@ -365,7 +363,7 @@ impl ImageLayer {
// TODO: could use smallvec here but it's a pain with Slice<T>
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_buf, res) = file.write_all(buf.slice_full(), ctx).await;
res?;
Ok(())
}
@@ -380,7 +378,6 @@ impl ImageLayerInner {
lsn: Lsn,
summary: Option<Summary>,
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
support_compressed_reads: bool,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path, ctx).await {
@@ -424,7 +421,6 @@ impl ImageLayerInner {
file,
file_id,
max_vectored_read_bytes,
compressed_reads: support_compressed_reads,
key_range: actual_summary.key_range,
}))
}
@@ -435,8 +431,7 @@ impl ImageLayerInner {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
@@ -496,14 +491,12 @@ impl ImageLayerInner {
&self,
ctx: &RequestContext,
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
let mut result = Vec::new();
let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let cursor = block_reader.block_cursor();
while let Some(item) = stream.next().await {
// TODO: dedup code with get_reconstruct_value
@@ -538,8 +531,7 @@ impl ImageLayerInner {
.into(),
);
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
@@ -700,8 +692,7 @@ impl ImageLayerInner {
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
let block_reader =
FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads);
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
ImageLayerIterator {
@@ -808,8 +799,9 @@ impl ImageLayerWriterInner {
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use tokio_epoll_uring::BoundedBuf as _;
ensure!(self.key_range.contains(&key));
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
let (_img, res) = self.blob_writer.write_blob(img.slice_full(), ctx).await;
// TODO: re-use the buffer for `img` further upstack
let off = res?;
@@ -828,6 +820,7 @@ impl ImageLayerWriterInner {
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<ResidentLayer> {
use tokio_epoll_uring::BoundedBuf as _;
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -838,7 +831,7 @@ impl ImageLayerWriterInner {
.await?;
let (index_root_blk, block_buf) = self.tree.finish()?;
for buf in block_buf.blocks {
let (_buf, res) = file.write_all(buf, ctx).await;
let (_slice, res) = file.write_all(buf.slice_full(), ctx).await;
res?;
}
@@ -858,7 +851,7 @@ impl ImageLayerWriterInner {
// TODO: could use smallvec here but it's a pain with Slice<T>
Summary::ser_into(&summary, &mut buf)?;
file.seek(SeekFrom::Start(0)).await?;
let (_buf, res) = file.write_all(buf, ctx).await;
let (_slice, res) = file.write_all(buf.slice_full(), ctx).await;
res?;
let metadata = file

View File

@@ -6,14 +6,13 @@
//!
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::ValueReconstructResult;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
use crate::{l0_flush, page_cache, walrecord};
use crate::{page_cache, walrecord};
use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
@@ -411,7 +410,6 @@ impl InMemoryLayer {
continue;
}
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
let buf = reader.read_blob(block_read.block_offset, &ctx).await;
if let Err(e) = buf {
reconstruct_state
@@ -622,13 +620,6 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await;
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
use l0_flush::Inner;
let _concurrency_permit = match &*l0_flush_global_state {
Inner::PageCached => None,
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
};
let end_lsn = *self.end_lsn.get().unwrap();
let key_count = if let Some(key_range) = key_range {
@@ -654,83 +645,28 @@ impl InMemoryLayer {
)
.await?;
match &*l0_flush_global_state {
l0_flush::Inner::PageCached => {
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let mut buf = Vec::new();
let mut buf = Vec::new();
let cursor = inner.file.block_cursor();
let cursor = inner.file.block_cursor();
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
.await;
res?;
}
}
}
l0_flush::Inner::Direct { .. } => {
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
assert_eq!(
file_contents.len() % PAGE_SZ,
0,
"needed by BlockReaderRef::Slice"
);
assert_eq!(file_contents.len(), {
let written = usize::try_from(inner.file.len()).unwrap();
if written % PAGE_SZ == 0 {
written
} else {
written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
}
});
let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
// TODO: once we have blob lengths in the in-memory index, we can
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
// 2. load the file contents into a Bytes and
// 3. the use `Bytes::slice` to get the `buf` that is our blob
// 4. pass that `buf` into `put_value_bytes`
// => https://github.com/neondatabase/neon/issues/8183
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init, ctx)
.await;
res?;
}
}
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let res;
(buf, res) = delta_layer_writer
.put_value_bytes(*key, *lsn, buf, will_init, &ctx)
.await;
res?;
}
}
// MAX is used here because we identify L0 layers by full key range
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
//
// If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
// the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
// Thus, we'd have more concurrenct `Vec<u8>` in existence than the semaphore allows.
//
// We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
drop(_concurrency_permit);
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
Ok(Some(delta_layer))
}
}

View File

@@ -1096,10 +1096,19 @@ impl LayerInner {
match rx.await {
Ok(Ok(res)) => Ok(res),
Ok(Err(remote_storage::DownloadError::Cancelled)) => {
Err(DownloadError::DownloadCancelled)
Ok(Err(e)) => {
// sleep already happened in the spawned task, if it was not cancelled
match e.downcast_ref::<remote_storage::DownloadError>() {
// If the download failed due to its cancellation token,
// propagate the cancellation error upstream.
Some(remote_storage::DownloadError::Cancelled) => {
Err(DownloadError::DownloadCancelled)
}
// FIXME: this is not embedding the error because historically it would had
// been output to compute, however that is no longer the case.
_ => Err(DownloadError::DownloadFailed),
}
}
Ok(Err(_)) => Err(DownloadError::DownloadFailed),
Err(_gone) => Err(DownloadError::DownloadCancelled),
}
}
@@ -1109,7 +1118,7 @@ impl LayerInner {
timeline: Arc<Timeline>,
permit: heavier_once_cell::InitPermit,
ctx: &RequestContext,
) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
) -> anyhow::Result<Arc<DownloadedLayer>> {
let result = timeline
.remote_client
.download_layer_file(
@@ -1685,7 +1694,6 @@ impl DownloadedLayer {
lsn,
summary,
Some(owner.conf.max_vectored_read_bytes),
owner.conf.image_compression.allow_decompression(),
ctx,
)
.await

View File

@@ -14,7 +14,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use arc_swap::ArcSwap;
use bytes::Bytes;
use camino::Utf8Path;
use chrono::{DateTime, Utc};
use enumset::EnumSet;
use fail::fail_point;
use once_cell::sync::Lazy;
@@ -66,6 +65,7 @@ use std::{
ops::{Deref, Range},
};
use crate::metrics::GetKind;
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
use crate::{
aux_file::AuxFileSizeEstimator,
@@ -90,10 +90,6 @@ use crate::{
use crate::{
disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
};
use crate::{
l0_flush::{self, L0FlushGlobalState},
metrics::GetKind,
};
use crate::{
metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
};
@@ -212,7 +208,6 @@ pub struct TimelineResources {
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
pub(crate) struct AuxFilesState {
@@ -365,7 +360,6 @@ pub struct Timeline {
repartition_threshold: u64,
last_image_layer_creation_check_at: AtomicLsn,
last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: LogicalSize,
@@ -439,8 +433,6 @@ pub struct Timeline {
/// in the future, add `extra_test_sparse_keyspace` if necessary.
#[cfg(test)]
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
pub(crate) l0_flush_global_state: L0FlushGlobalState,
}
pub struct WalReceiverInfo {
@@ -465,9 +457,6 @@ pub(crate) struct GcInfo {
/// Leases granted to particular LSNs.
pub(crate) leases: BTreeMap<Lsn, LsnLease>,
/// Whether our branch point is within our ancestor's PITR interval (for cost estimation)
pub(crate) within_ancestor_pitr: bool,
}
impl GcInfo {
@@ -728,9 +717,6 @@ impl From<CreateImageLayersError> for CompactionError {
fn from(e: CreateImageLayersError) -> Self {
match e {
CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
CreateImageLayersError::Other(e) => {
CompactionError::Other(e.context("create image layers"))
}
_ => CompactionError::Other(e.into()),
}
}
@@ -859,18 +845,6 @@ impl Timeline {
.map(|ancestor| ancestor.timeline_id)
}
/// Get the bytes written since the PITR cutoff on this branch, and
/// whether this branch's ancestor_lsn is within its parent's PITR.
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.pitr)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
}
/// Lock and get timeline's GC cutoff
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
@@ -1022,7 +996,6 @@ impl Timeline {
}
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;
/// Look up multiple page versions at a given LSN
///
@@ -1255,7 +1228,7 @@ impl Timeline {
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(get_kind)
.start_timer();
self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
.await?;
get_data_timer.stop_and_record();
@@ -1285,25 +1258,11 @@ impl Timeline {
// (this is a requirement, not a bug). Skip updating the metric in these cases
// to avoid infinite results.
if !results.is_empty() {
let avg = layers_visited as f64 / results.len() as f64;
if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
let mut rate_limit = LOGGED.lock().unwrap();
rate_limit.call(|| {
tracing::info!(
shard_id = %self.tenant_shard_id.shard_slug(),
lsn = %lsn,
"Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
});
}
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
}
Ok(results)
@@ -1595,13 +1554,7 @@ impl Timeline {
let existing_lease = occupied.get_mut();
if valid_until > existing_lease.valid_until {
existing_lease.valid_until = valid_until;
let dt: DateTime<Utc> = valid_until.into();
info!("lease extended to {}", dt);
} else {
let dt: DateTime<Utc> = existing_lease.valid_until.into();
info!("existing lease covers greater length, valid until {}", dt);
}
existing_lease.clone()
} else {
// Reject already GC-ed LSN (lsn < latest_gc_cutoff)
@@ -1610,8 +1563,6 @@ impl Timeline {
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
let dt: DateTime<Utc> = valid_until.into();
info!("lease created, valid until {}", dt);
entry.or_insert(LsnLease { valid_until }).clone()
}
};
@@ -2388,7 +2339,6 @@ impl Timeline {
)),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_image_layer_creation_check_instant: Mutex::new(None),
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(RelSizeCache {
@@ -2426,8 +2376,6 @@ impl Timeline {
#[cfg(test)]
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
l0_flush_global_state: resources.l0_flush_global_state,
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -4469,58 +4417,6 @@ impl Timeline {
}
}
/// Predicate function which indicates whether we should check if new image layers
/// are required. Since checking if new image layers are required is expensive in
/// terms of CPU, we only do it in the following cases:
/// 1. If the timeline has ingested sufficient WAL to justify the cost
/// 2. If enough time has passed since the last check
/// 2.1. For large tenants, we wish to perform the check more often since they
/// suffer from the lack of image layers
/// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
let last_checks_at = self.last_image_layer_creation_check_at.load();
let distance = lsn
.checked_sub(last_checks_at)
.expect("Attempt to compact with LSN going backwards");
let min_distance =
self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance();
let distance_based_decision = distance.0 >= min_distance;
let mut time_based_decision = false;
let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
{
self.get_checkpoint_timeout()
} else {
Duration::from_secs(3600 * 48)
};
time_based_decision = match *last_check_instant {
Some(last_check) => {
let elapsed = last_check.elapsed();
elapsed >= check_required_after
}
None => true,
};
}
// Do the expensive delta layer counting only if this timeline has ingested sufficient
// WAL since the last check or a checkpoint timeout interval has elapsed since the last
// check.
let decision = distance_based_decision || time_based_decision;
if decision {
self.last_image_layer_creation_check_at.store(lsn);
*last_check_instant = Some(Instant::now());
}
decision
}
#[tracing::instrument(skip_all, fields(%lsn, %mode))]
async fn create_image_layers(
self: &Arc<Timeline>,
@@ -4543,7 +4439,22 @@ impl Timeline {
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
let mut start = Key::MIN;
let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
let check_for_image_layers = {
let last_checks_at = self.last_image_layer_creation_check_at.load();
let distance = lsn
.checked_sub(last_checks_at)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting if this timeline has not ingested sufficient
// WAL since the last check.
distance.0 >= min_distance
};
if check_for_image_layers {
self.last_image_layer_creation_check_at.store(lsn);
}
for partition in partitioning.parts.iter() {
let img_range = start..partition.ranges.last().unwrap().end;
@@ -4800,42 +4711,6 @@ impl DurationRecorder {
}
}
/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the
/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore,
/// the layer descriptor requires the user to provide the ranges, which should cover all
/// keys specified in the `data` field.
#[cfg(test)]
pub struct DeltaLayerTestDesc {
pub lsn_range: Range<Lsn>,
pub key_range: Range<Key>,
pub data: Vec<(Key, Lsn, Value)>,
}
#[cfg(test)]
impl DeltaLayerTestDesc {
#[allow(dead_code)]
pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
Self {
lsn_range,
key_range,
data,
}
}
pub fn new_with_inferred_key_range(
lsn_range: Range<Lsn>,
data: Vec<(Key, Lsn, Value)>,
) -> Self {
let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
Self {
key_range: (*key_min)..(key_max.next()),
lsn_range,
data,
}
}
}
impl Timeline {
async fn finish_compact_batch(
self: &Arc<Self>,
@@ -5636,65 +5511,37 @@ impl Timeline {
#[cfg(test)]
pub(super) async fn force_create_delta_layer(
self: &Arc<Timeline>,
mut deltas: DeltaLayerTestDesc,
mut deltas: Vec<(Key, Lsn, Value)>,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let last_record_lsn = self.get_last_record_lsn();
deltas
.data
.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start);
assert!(deltas.data.last().unwrap().0 < deltas.key_range.end);
for (_, lsn, _) in &deltas.data {
assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end);
}
deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
let min_key = *deltas.first().map(|(k, _, _)| k).unwrap();
let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next();
let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
assert!(
deltas.lsn_range.end <= last_record_lsn,
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
deltas.lsn_range.end,
last_record_lsn
max_lsn <= last_record_lsn,
"advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}"
);
let end_lsn = Lsn(max_lsn.0 + 1);
if let Some(check_start_lsn) = check_start_lsn {
assert!(deltas.lsn_range.start >= check_start_lsn);
}
// check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of
// layers of the same start/end LSN, and so should the force inserted layer
{
/// Checks if a overlaps with b, assume a/b = [start, end).
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
let guard = self.layers.read().await;
for layer in guard.layer_map().iter_historic_layers() {
if layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
{
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
panic!(
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end
);
}
}
assert!(min_lsn >= check_start_lsn);
}
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
deltas.key_range.start,
deltas.lsn_range,
min_key,
min_lsn..end_lsn,
ctx,
)
.await?;
for (key, lsn, val) in deltas.data {
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
let delta_layer = delta_layer_writer
.finish(deltas.key_range.end, self, ctx)
.await?;
let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?;
{
let mut guard = self.layers.write().await;

View File

@@ -272,7 +272,6 @@ impl DeleteTimelineFlow {
TimelineResources {
remote_client,
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
},
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
@@ -208,9 +208,14 @@ pub(super) async fn handle_walreceiver_connection(
.instrument(tracing::info_span!("poller")),
);
let _guard = LIVE_CONNECTIONS
.with_label_values(&["wal_receiver"])
.guard();
// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
// get called, even in presence of panics.
let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
let identify = identify_system(&replication_client).await?;
info!("{identify:?}");

View File

@@ -671,17 +671,27 @@ impl VirtualFile {
}
/// Writes `buf.slice(0..buf.bytes_init())`.
/// Returns the IoBuf that is underlying the BoundedBuf `buf`.
/// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
/// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
pub async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
///
/// The returned `Slice<Buf>` is equivalent to the input `slice`, i.e., it's the same view into the same buffer.
/// We also return the amount of written data in the `Ok()` variant (although we could also have encountered
/// before an `Err()`).
pub async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: B,
buf: Slice<Buf>,
ctx: &RequestContext,
) -> (B::Buf, Result<usize, Error>) {
) -> (Slice<Buf>, Result<usize, Error>) {
let begin_end = buf.bounds();
macro_rules! return_orig_bounds {
($buf:expr, $val:expr) => {{
let buf = $buf.into_inner();
return (buf.slice(begin_end), $val);
}};
}
let nbytes = buf.bytes_init();
if nbytes == 0 {
return (Slice::into_inner(buf.slice_full()), Ok(0));
return (buf, Ok(0));
}
let mut buf = buf.slice(0..nbytes);
while !buf.is_empty() {
@@ -689,22 +699,23 @@ impl VirtualFile {
(buf, res) = self.write(buf, ctx).await;
match res {
Ok(0) => {
return (
Slice::into_inner(buf),
return_orig_bounds!(
buf,
Err(Error::new(
std::io::ErrorKind::WriteZero,
"failed to write whole buffer",
)),
))
);
}
Ok(n) => {
buf = buf.slice(n..);
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return (Slice::into_inner(buf), Err(e)),
Err(e) => return_orig_bounds!(buf, Err(e)),
}
}
(Slice::into_inner(buf), Ok(nbytes))
return_orig_bounds!(buf, Ok(nbytes));
}
async fn write<B: IoBuf + Send>(
@@ -1096,8 +1107,8 @@ impl OwnedAsyncWriter for VirtualFile {
buf: B,
ctx: &RequestContext,
) -> std::io::Result<(usize, B::Buf)> {
let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
res.map(move |v| (v, buf))
let (buf, res) = VirtualFile::write_all(self, buf.slice_full(), ctx).await;
res.map(move |v| (v, Slice::into_inner(buf)))
}
}
@@ -1217,9 +1228,9 @@ mod tests {
MaybeVirtualFile::File(file) => file.seek(pos),
}
}
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: B,
buf: Slice<Buf>,
ctx: &RequestContext,
) -> Result<(), Error> {
match self {
@@ -1345,7 +1356,9 @@ mod tests {
&ctx,
)
.await?;
file_a.write_all(b"foobar".to_vec(), &ctx).await?;
file_a
.write_all(b"foobar".to_vec().slice_full(), &ctx)
.await?;
// cannot read from a file opened in write-only mode
let _ = file_a.read_string(&ctx).await.unwrap_err();
@@ -1354,7 +1367,10 @@ mod tests {
let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
// cannot write to a file opened in read-only mode
let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
let _ = file_a
.write_all(b"bar".to_vec().slice_full(), &ctx)
.await
.unwrap_err();
// Try simple read
assert_eq!("foobar", file_a.read_string(&ctx).await?);

View File

@@ -6,7 +6,6 @@ OBJS = \
$(WIN32RES) \
extension_server.o \
file_cache.o \
hll.o \
libpagestore.o \
neon.o \
neon_utils.o \
@@ -23,7 +22,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -26,6 +26,7 @@
#include "miscadmin.h"
#include "pagestore_client.h"
#include "common/hashfn.h"
#include "lib/hyperloglog.h"
#include "pgstat.h"
#include "postmaster/bgworker.h"
#include RELFILEINFO_HDR
@@ -39,8 +40,6 @@
#include "utils/dynahash.h"
#include "utils/guc.h"
#include "hll.h"
/*
* Local file cache is used to temporary store relations pages in local file system.
* All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -63,6 +62,7 @@
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
#define MB ((uint64)1024*1024)
#define HYPER_LOG_LOG_BIT_WIDTH 10
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
typedef struct FileCacheEntry
@@ -87,7 +87,8 @@ typedef struct FileCacheControl
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
HyperLogLogState wss_estimation; /* estimation of working set size */
hyperLogLogState wss_estimation; /* estimation of wroking set size */
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
} FileCacheControl;
static HTAB *lfc_hash;
@@ -237,7 +238,12 @@ lfc_shmem_startup(void)
dlist_init(&lfc_ctl->lru);
/* Initialize hyper-log-log structure for estimating working set size */
initSHLL(&lfc_ctl->wss_estimation);
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
/* We need hashes in shared memory */
pfree(lfc_ctl->wss_estimation.hashesArr);
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
@@ -539,7 +545,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/* Approximate working set */
tag.blockNum = blkno;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
{
@@ -980,38 +986,20 @@ local_cache_pages(PG_FUNCTION_ARGS)
SRF_RETURN_DONE(funcctx);
}
PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
Datum
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
{
if (lfc_size_limit != 0)
{
int32 dc;
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
LWLockAcquire(lfc_lock, LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
LWLockRelease(lfc_lock);
PG_RETURN_INT32(dc);
}
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(approximate_working_set_size);
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
int32 dc = -1;
if (lfc_size_limit != 0)
{
int32 dc;
bool reset = PG_GETARG_BOOL(0);
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
if (reset)
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
LWLockRelease(lfc_lock);
PG_RETURN_INT32(dc);
}
PG_RETURN_NULL();
PG_RETURN_INT32(dc);
}

View File

@@ -1,193 +0,0 @@
/*-------------------------------------------------------------------------
*
* hll.c
* Sliding HyperLogLog cardinality estimator
*
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
*
* Implements https://hal.science/hal-00465313/document
*
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
* suited to estimating the cardinality of very large sets; in particular, we
* have not attempted to further optimize the implementation as described in
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
* Engineering of a State of The Art Cardinality Estimation Algorithm".
*
* A sparse representation of HyperLogLog state is used, with fixed space
* overhead.
*
* The copyright terms of Ohno's original version (the MIT license) follow.
*
* IDENTIFICATION
* src/backend/lib/hyperloglog.c
*
*-------------------------------------------------------------------------
*/
/*
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the 'Software'), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <math.h>
#include "postgres.h"
#include "funcapi.h"
#include "port/pg_bitutils.h"
#include "utils/timestamp.h"
#include "hll.h"
#define POW_2_32 (4294967296.0)
#define NEG_POW_2_32 (-4294967296.0)
#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)
/*
* Worker for addHyperLogLog().
*
* Calculates the position of the first set bit in first b bits of x argument
* starting from the first, reading from most significant to least significant
* bits.
*
* Example (when considering fist 10 bits of x):
*
* rho(x = 0b1000000000) returns 1
* rho(x = 0b0010000000) returns 3
* rho(x = 0b0000000000) returns b + 1
*
* "The binary address determined by the first b bits of x"
*
* Return value "j" used to index bit pattern to watch.
*/
static inline uint8
rho(uint32 x, uint8 b)
{
uint8 j = 1;
if (x == 0)
return b + 1;
j = 32 - pg_leftmost_one_pos32(x);
if (j > b)
return b + 1;
return j;
}
/*
* Initialize HyperLogLog track state
*/
void
initSHLL(HyperLogLogState *cState)
{
memset(cState->regs, 0, sizeof(cState->regs));
}
/*
* Adds element to the estimator, from caller-supplied hash.
*
* It is critical that the hash value passed be an actual hash value, typically
* generated using hash_any(). The algorithm relies on a specific bit-pattern
* observable in conjunction with stochastic averaging. There must be a
* uniform distribution of bits in hash values for each distinct original value
* observed.
*/
void
addSHLL(HyperLogLogState *cState, uint32 hash)
{
uint8 count;
uint32 index;
size_t i;
size_t j;
TimestampTz now = GetCurrentTimestamp();
/* Use the first "k" (registerWidth) bits as a zero based index */
index = hash >> HLL_C_BITS;
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
cState->regs[index][count] = now;
}
static uint8
getMaximum(const TimestampTz* reg, TimestampTz since)
{
uint8 max = 0;
for (size_t i = 0; i < HLL_C_BITS + 1; i++)
{
if (reg[i] >= since)
{
max = i;
}
}
return max;
}
/*
* Estimates cardinality, based on elements added so far
*/
double
estimateSHLL(HyperLogLogState *cState, time_t duration)
{
double result;
double sum = 0.0;
size_t i;
uint8 R[HLL_N_REGISTERS];
/* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */
TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC;
for (i = 0; i < HLL_N_REGISTERS; i++)
{
R[i] = getMaximum(cState->regs[i], since);
sum += 1.0 / pow(2.0, R[i]);
}
/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
result = ALPHA_MM / sum;
if (result <= (5.0 / 2.0) * HLL_N_REGISTERS)
{
/* Small range correction */
int zero_count = 0;
for (i = 0; i < HLL_N_REGISTERS; i++)
{
zero_count += R[i] == 0;
}
if (zero_count != 0)
result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS /
zero_count);
}
else if (result > (1.0 / 30.0) * POW_2_32)
{
/* Large range correction */
result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
}
return result;
}

View File

@@ -1,86 +0,0 @@
/*-------------------------------------------------------------------------
*
* hll.h
* Sliding HyperLogLog cardinality estimator
*
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
*
* Implements https://hal.science/hal-00465313/document
*
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
* suited to estimating the cardinality of very large sets; in particular, we
* have not attempted to further optimize the implementation as described in
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
* Engineering of a State of The Art Cardinality Estimation Algorithm".
*
* A sparse representation of HyperLogLog state is used, with fixed space
* overhead.
*
* The copyright terms of Ohno's original version (the MIT license) follow.
*
* IDENTIFICATION
* src/backend/lib/hyperloglog.c
*
*-------------------------------------------------------------------------
*/
/*
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the 'Software'), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef HLL_H
#define HLL_H
#define HLL_BIT_WIDTH 10
#define HLL_C_BITS (32 - HLL_BIT_WIDTH)
#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)
/*
* HyperLogLog is an approximate technique for computing the number of distinct
* entries in a set. Importantly, it does this by using a fixed amount of
* memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
* cardinality estimation algorithm" for more.
*
* Instead of a single counter for every bits register, we have a timestamp
* for every valid number of bits we can encounter. Every time we encounter
* a certain number of bits, we update the timestamp in those registers to
* the current timestamp.
*
* We can query the sketch's stored cardinality for the range of some timestamp
* up to now: For each register, we return the highest bits bucket that has a
* modified timestamp >= the query timestamp. This value is the number of bits
* for this register in the normal HLL calculation.
*
* The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
* Usage could be halved if we decide to reduce the required time dimension
* precision; as 32 bits in second precision should be enough for statistics.
* However, that is not yet implemented.
*/
typedef struct HyperLogLogState
{
TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
} HyperLogLogState;
extern void initSHLL(HyperLogLogState *cState);
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
extern double estimateSHLL(HyperLogLogState *cState, time_t dutration);
#endif

View File

@@ -427,17 +427,12 @@ pageserver_connect(shardno_t shard_no, int elevel)
values[n_pgsql_params] = NULL;
shard->conn = PQconnectStartParams(keywords, values, 1);
if (PQstatus(shard->conn) == CONNECTION_BAD)
if (!shard->conn)
{
char *msg = pchomp(PQerrorMessage(shard->conn));
CLEANUP_AND_DISCONNECT(shard);
ereport(elevel,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
errdetail_internal("%s", msg)));
pfree(msg);
neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
return false;
}
shard->state = PS_Connecting_Startup;
/* fallthrough */
}

View File

@@ -1,9 +0,0 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. \quit
CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds'
LANGUAGE C PARALLEL SAFE;
GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor;

View File

@@ -1 +0,0 @@
DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE;

View File

@@ -7,7 +7,7 @@ OBJS = \
neontest.o
EXTENSION = neon_test_utils
DATA = neon_test_utils--1.3.sql
DATA = neon_test_utils--1.2.sql
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
PG_CONFIG = pg_config

View File

@@ -45,21 +45,3 @@ CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL)
RETURNS VOID
AS 'MODULE_PATHNAME', 'neon_xlogflush'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION trigger_panic()
RETURNS VOID
AS 'MODULE_PATHNAME', 'trigger_panic'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION trigger_segfault()
RETURNS VOID
AS 'MODULE_PATHNAME', 'trigger_segfault'
LANGUAGE C PARALLEL UNSAFE;
-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun
CREATE OR REPLACE FUNCTION 💣() RETURNS void
LANGUAGE plpgsql AS $$
BEGIN
PERFORM trigger_segfault();
END;
$$;

View File

@@ -1,6 +1,6 @@
# neon_test_utils extension
comment = 'helpers for neon testing and debugging'
default_version = '1.3'
default_version = '1.2'
module_pathname = '$libdir/neon_test_utils'
relocatable = true
trusted = true

View File

@@ -42,8 +42,6 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
PG_FUNCTION_INFO_V1(neon_xlogflush);
PG_FUNCTION_INFO_V1(trigger_panic);
PG_FUNCTION_INFO_V1(trigger_segfault);
/*
* Linkage to functions in neon module.
@@ -471,9 +469,9 @@ neon_xlogflush(PG_FUNCTION_ARGS)
* The LSN returned by GetXLogInsertRecPtr() is the position where the
* next inserted record would begin. If the last record ended just at
* the page boundary, the next record will begin after the page header
* on the next page, but the next page's page header has not been
* written yet. If we tried to flush it, XLogFlush() would throw an
* error:
* on the next page, and that's what GetXLogInsertRecPtr().returns,
* but the page header has not been written yet. If we tried to flush
* it, XLogFlush() would throw an error:
*
* ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X
*
@@ -491,24 +489,3 @@ neon_xlogflush(PG_FUNCTION_ARGS)
XLogFlush(lsn);
PG_RETURN_VOID();
}
/*
* Function to trigger panic.
*/
Datum
trigger_panic(PG_FUNCTION_ARGS)
{
elog(PANIC, "neon_test_utils: panic");
PG_RETURN_VOID();
}
/*
* Function to trigger a segfault.
*/
Datum
trigger_segfault(PG_FUNCTION_ARGS)
{
int *ptr = NULL;
*ptr = 42;
PG_RETURN_VOID();
}

8
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0"
[[package]]
name = "certifi"
version = "2024.7.4"
version = "2023.7.22"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"},
{file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"},
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
]
[[package]]

View File

@@ -216,11 +216,10 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
use pq_proto::FeStartupPacket::*;
match msg {
SslRequest { direct: false } => {
SslRequest => {
stream
.write_message(&pq_proto::BeMessage::EncryptionResponse(true))
.await?;
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.

View File

@@ -35,7 +35,6 @@ use proxy::usage_metrics;
use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use remote_storage::RemoteStorageConfig;
use std::net::SocketAddr;
use std::pin::pin;
use std::sync::Arc;
@@ -206,8 +205,8 @@ struct ProxyCliArgs {
/// remote storage configuration for backup metric collection
/// Encoded as toml (same format as pageservers), eg
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
#[clap(long, value_parser = remote_storage_from_toml)]
metric_backup_collection_remote_storage: Option<RemoteStorageConfig>,
#[clap(long, default_value = "{}")]
metric_backup_collection_remote_storage: String,
/// chunk size for backup metric collection
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
#[clap(long, default_value = "4194304")]
@@ -512,7 +511,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
}
let backup_metric_collection_config = config::MetricBackupCollectionConfig {
interval: args.metric_backup_collection_interval,
remote_storage_config: args.metric_backup_collection_remote_storage.clone(),
remote_storage_config: remote_storage_from_toml(
&args.metric_backup_collection_remote_storage,
)?,
chunk_size: args.metric_backup_collection_chunk_size,
};

View File

@@ -53,13 +53,6 @@ impl<C: Cache, V> Cached<C, V> {
)
}
pub fn map<U>(self, f: impl FnOnce(V) -> U) -> Cached<C, U> {
Cached {
token: self.token,
value: f(self.value),
}
}
/// Drop this entry from a cache if it's still there.
pub fn invalidate(self) -> V {
if let Some((cache, info)) = &self.token {

View File

@@ -65,8 +65,6 @@ impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
struct Entry<T> {
created_at: Instant,
expires_at: Instant,
ttl: Duration,
update_ttl_on_retrieval: bool,
value: T,
}
@@ -124,6 +122,7 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
Q: Hash + Eq + ?Sized,
{
let now = Instant::now();
let deadline = now.checked_add(self.ttl).expect("time overflow");
// Do costly things before taking the lock.
let mut cache = self.cache.lock();
@@ -143,8 +142,7 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
// Update the deadline and the entry's position in the LRU list.
let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow");
if raw_entry.get().update_ttl_on_retrieval {
if self.update_ttl_on_retrieval {
raw_entry.get_mut().expires_at = deadline;
}
raw_entry.to_back();
@@ -164,27 +162,12 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
/// existed, return the previous value and its creation timestamp.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn insert_raw(&self, key: K, value: V) -> (Instant, Option<V>) {
self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval)
}
/// Insert an entry to the cache. If an entry with the same key already
/// existed, return the previous value and its creation timestamp.
#[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
fn insert_raw_ttl(
&self,
key: K,
value: V,
ttl: Duration,
update: bool,
) -> (Instant, Option<V>) {
let created_at = Instant::now();
let expires_at = created_at.checked_add(ttl).expect("time overflow");
let expires_at = created_at.checked_add(self.ttl).expect("time overflow");
let entry = Entry {
created_at,
expires_at,
ttl,
update_ttl_on_retrieval: update,
value,
};
@@ -207,21 +190,6 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
}
impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) {
self.insert_raw_ttl(key, value, ttl, false);
}
pub fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
let (created_at, old) = self.insert_raw(key.clone(), value);
let cached = Cached {
token: Some((self, LookupInfo { created_at, key })),
value: (),
};
(old, cached)
}
pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
let (created_at, old) = self.insert_raw(key.clone(), value.clone());

View File

@@ -75,9 +75,6 @@ impl TlsConfig {
}
}
/// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/include/libpq/pqcomm.h#L159>
pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql";
/// Configure TLS for the main endpoint.
pub fn configure_tls(
key_path: &str,
@@ -114,17 +111,16 @@ pub fn configure_tls(
let cert_resolver = Arc::new(cert_resolver);
// allow TLS 1.2 to be compatible with older client libraries
let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[
let config = rustls::ServerConfig::builder_with_protocol_versions(&[
&rustls::version::TLS13,
&rustls::version::TLS12,
])
.with_no_client_auth()
.with_cert_resolver(cert_resolver.clone());
config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()];
.with_cert_resolver(cert_resolver.clone())
.into();
Ok(TlsConfig {
config: Arc::new(config),
config,
common_names,
cert_resolver,
})
@@ -403,11 +399,15 @@ impl FromStr for EndpointCacheConfig {
#[derive(Debug)]
pub struct MetricBackupCollectionConfig {
pub interval: Duration,
pub remote_storage_config: Option<RemoteStorageConfig>,
pub remote_storage_config: OptRemoteStorageConfig,
pub chunk_size: usize,
}
pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<RemoteStorageConfig> {
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
/// runtime type errors from the value parser we use.
pub type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
pub fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
RemoteStorageConfig::from_toml(&s.parse()?)
}

View File

@@ -9,7 +9,7 @@ use crate::proxy::retry::CouldRetry;
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
#[derive(Debug, Deserialize, Clone)]
#[derive(Debug, Deserialize)]
pub struct ConsoleError {
pub error: Box<str>,
#[serde(skip)]
@@ -82,19 +82,41 @@ impl CouldRetry for ConsoleError {
.details
.error_info
.map_or(Reason::Unknown, |e| e.reason);
reason.can_retry()
match reason {
// not a transitive error
Reason::RoleProtected => false,
// on retry, it will still not be found
Reason::ResourceNotFound
| Reason::ProjectNotFound
| Reason::EndpointNotFound
| Reason::BranchNotFound => false,
// we were asked to go away
Reason::RateLimitExceeded
| Reason::NonDefaultBranchComputeTimeExceeded
| Reason::ActiveTimeQuotaExceeded
| Reason::ComputeTimeQuotaExceeded
| Reason::WrittenDataQuotaExceeded
| Reason::DataTransferQuotaExceeded
| Reason::LogicalSizeQuotaExceeded => false,
// transitive error. control plane is currently busy
// but might be ready soon
Reason::RunningOperations => true,
Reason::ConcurrencyLimitReached => true,
Reason::LockAlreadyTaken => true,
// unknown error. better not retry it.
Reason::Unknown => false,
}
}
}
#[derive(Debug, Deserialize, Clone)]
#[derive(Debug, Deserialize)]
pub struct Status {
pub code: Box<str>,
pub message: Box<str>,
pub details: Details,
}
#[derive(Debug, Deserialize, Clone)]
#[derive(Debug, Deserialize)]
pub struct Details {
pub error_info: Option<ErrorInfo>,
pub retry_info: Option<RetryInfo>,
@@ -177,34 +199,6 @@ impl Reason {
| Reason::BranchNotFound
)
}
pub fn can_retry(&self) -> bool {
match self {
// do not retry role protected errors
// not a transitive error
Reason::RoleProtected => false,
// on retry, it will still not be found
Reason::ResourceNotFound
| Reason::ProjectNotFound
| Reason::EndpointNotFound
| Reason::BranchNotFound => false,
// we were asked to go away
Reason::RateLimitExceeded
| Reason::NonDefaultBranchComputeTimeExceeded
| Reason::ActiveTimeQuotaExceeded
| Reason::ComputeTimeQuotaExceeded
| Reason::WrittenDataQuotaExceeded
| Reason::DataTransferQuotaExceeded
| Reason::LogicalSizeQuotaExceeded => false,
// transitive error. control plane is currently busy
// but might be ready soon
Reason::RunningOperations
| Reason::ConcurrencyLimitReached
| Reason::LockAlreadyTaken => true,
// unknown error. better not retry it.
Reason::Unknown => false,
}
}
}
#[derive(Copy, Clone, Debug, Deserialize)]
@@ -212,7 +206,7 @@ pub struct RetryInfo {
pub retry_delay_ms: u64,
}
#[derive(Debug, Deserialize, Clone)]
#[derive(Debug, Deserialize)]
pub struct UserFacingMessage {
pub message: Box<str>,
}

View File

@@ -6,9 +6,8 @@ use anyhow::Context;
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use std::convert::Infallible;
use std::{convert::Infallible, future};
use tokio::net::{TcpListener, TcpStream};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, Instrument};
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
@@ -68,9 +67,7 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
pgbackend
.run(&mut MgmtHandler, &CancellationToken::new())
.await
pgbackend.run(&mut MgmtHandler, future::pending::<()>).await
}
/// A message received by `mgmt` when a compute node is ready.

View File

@@ -2,7 +2,7 @@
pub mod mock;
pub mod neon;
use super::messages::{ConsoleError, MetricsAuxInfo};
use super::messages::MetricsAuxInfo;
use crate::{
auth::{
backend::{ComputeCredentialKeys, ComputeUserInfo},
@@ -317,8 +317,8 @@ impl NodeInfo {
}
}
pub type NodeInfoCache = TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ConsoleError>>>;
pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;

View File

@@ -9,7 +9,7 @@ use super::{
use crate::{
auth::backend::ComputeUserInfo,
compute,
console::messages::{ColdStartInfo, Reason},
console::messages::ColdStartInfo,
http,
metrics::{CacheOutcome, Metrics},
rate_limiter::EndpointRateLimiter,
@@ -17,10 +17,10 @@ use crate::{
};
use crate::{cache::Cached, context::RequestMonitoring};
use futures::TryFutureExt;
use std::{sync::Arc, time::Duration};
use std::sync::Arc;
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{debug, error, info, info_span, warn, Instrument};
use tracing::{error, info, info_span, warn, Instrument};
pub struct Api {
endpoint: http::Endpoint,
@@ -273,34 +273,26 @@ impl super::Api for Api {
) -> Result<CachedNodeInfo, WakeComputeError> {
let key = user_info.endpoint_cache_key();
macro_rules! check_cache {
() => {
if let Some(cached) = self.caches.node_info.get(&key) {
let (cached, info) = cached.take_value();
let info = info.map_err(|c| {
info!(key = &*key, "found cached wake_compute error");
WakeComputeError::ApiError(ApiError::Console(*c))
})?;
debug!(key = &*key, "found cached compute node info");
ctx.set_project(info.aux.clone());
return Ok(cached.map(|()| info));
}
};
}
// Every time we do a wakeup http request, the compute node will stay up
// for some time (highly depends on the console's scale-to-zero policy);
// The connection info remains the same during that period of time,
// which means that we might cache it to reduce the load and latency.
check_cache!();
if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info");
ctx.set_project(cached.aux.clone());
return Ok(cached);
}
let permit = self.locks.get_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
// double check
if permit.should_check_cache() {
check_cache!();
if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info");
ctx.set_project(cached.aux.clone());
return Ok(cached);
}
}
// check rate limit
@@ -308,56 +300,23 @@ impl super::Api for Api {
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize_intern(), 1)
{
info!(key = &*key, "found cached compute node info");
return Err(WakeComputeError::TooManyConnections);
}
let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
match node {
Ok(node) => {
ctx.set_project(node.aux.clone());
debug!(key = &*key, "created a cache entry for woken compute node");
let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?;
ctx.set_project(node.aux.clone());
let cold_start_info = node.aux.cold_start_info;
info!("woken up a compute node");
let mut stored_node = node.clone();
// store the cached node as 'warm_cached'
stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
// store the cached node as 'warm'
node.aux.cold_start_info = ColdStartInfo::WarmCached;
let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
cached.aux.cold_start_info = cold_start_info;
let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
info!(key = &*key, "created a cache entry for compute node info");
Ok(cached.map(|()| node))
}
Err(err) => match err {
WakeComputeError::ApiError(ApiError::Console(err)) => {
let Some(status) = &err.status else {
return Err(WakeComputeError::ApiError(ApiError::Console(err)));
};
let reason = status
.details
.error_info
.map_or(Reason::Unknown, |x| x.reason);
// if we can retry this error, do not cache it.
if reason.can_retry() {
return Err(WakeComputeError::ApiError(ApiError::Console(err)));
}
// at this point, we should only have quota errors.
debug!(
key = &*key,
"created a cache entry for the wake compute error"
);
self.caches.node_info.insert_ttl(
key,
Err(Box::new(err.clone())),
Duration::from_secs(30),
);
Err(WakeComputeError::ApiError(ApiError::Console(err)))
}
err => return Err(err),
},
}
Ok(cached)
}
}

View File

@@ -14,14 +14,17 @@ use parquet::{
record::RecordWriter,
};
use pq_proto::StartupMessageParams;
use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use serde::ser::SerializeMap;
use tokio::{sync::mpsc, time};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, Span};
use utils::backoff;
use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT};
use crate::{
config::{remote_storage_from_toml, OptRemoteStorageConfig},
context::LOG_CHAN_DISCONNECT,
};
use super::{RequestMonitoring, LOG_CHAN};
@@ -30,11 +33,11 @@ pub struct ParquetUploadArgs {
/// Storage location to upload the parquet files to.
/// Encoded as toml (same format as pageservers), eg
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
#[clap(long, value_parser = remote_storage_from_toml)]
parquet_upload_remote_storage: Option<RemoteStorageConfig>,
#[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
parquet_upload_remote_storage: OptRemoteStorageConfig,
#[clap(long, value_parser = remote_storage_from_toml)]
parquet_upload_disconnect_events_remote_storage: Option<RemoteStorageConfig>,
#[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig,
/// How many rows to include in a row group
#[clap(long, default_value_t = 8192)]

View File

@@ -3,8 +3,8 @@ use std::marker::PhantomData;
use measured::{
label::NoLabels,
metric::{
gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding,
MetricFamilyEncoding, MetricType,
gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
MetricEncoding, MetricFamilyEncoding, MetricType,
},
text::TextEncoder,
LabelGroup, MetricGroup,
@@ -100,7 +100,7 @@ macro_rules! jemalloc_gauge {
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
if let Ok(v) = mib.read() {
GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?;
enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
}
Ok(())
}

View File

@@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock};
use lasso::ThreadedRodeo;
use measured::{
label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
metric::{histogram::Thresholds, name::MetricName},
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
LabelGroup, MetricGroup,
@@ -577,32 +577,6 @@ impl LabelGroup for ThreadPoolWorkerId {
}
}
impl LabelGroupSet for ThreadPoolWorkers {
type Group<'a> = ThreadPoolWorkerId;
fn cardinality(&self) -> Option<usize> {
Some(self.0)
}
fn encode_dense(&self, value: Self::Unique) -> Option<usize> {
Some(value)
}
fn decode_dense(&self, value: usize) -> Self::Group<'_> {
ThreadPoolWorkerId(value)
}
type Unique = usize;
fn encode(&self, value: Self::Group<'_>) -> Option<Self::Unique> {
Some(value.0)
}
fn decode(&self, value: &Self::Unique) -> Self::Group<'_> {
ThreadPoolWorkerId(*value)
}
}
impl LabelSet for ThreadPoolWorkers {
type Value<'a> = ThreadPoolWorkerId;

View File

@@ -1,17 +1,11 @@
use bytes::Buf;
use pq_proto::{
framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion,
StartupMessageParams,
};
use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
use tracing::info;
use crate::{
auth::endpoint_sni,
config::{TlsConfig, PG_ALPN_PROTOCOL},
config::TlsConfig,
error::ReportableError,
metrics::Metrics,
proxy::ERR_INSECURE_CONNECTION,
stream::{PqStream, Stream, StreamUpgradeError},
};
@@ -74,9 +68,6 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0);
const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
@@ -84,96 +75,40 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
use FeStartupPacket::*;
match msg {
SslRequest { direct } => match stream.get_ref() {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let have_tls = tls.is_some();
if !direct {
stream
.write_message(&Be::EncryptionResponse(have_tls))
.await?;
} else if !have_tls {
return Err(HandshakeError::ProtocolViolation);
}
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let Framed {
stream: raw,
read_buf,
write_buf,
} = stream.framed;
let Stream::Raw { raw } = raw else {
return Err(HandshakeError::StreamUpgradeError(
StreamUpgradeError::AlreadyTls,
));
};
let mut read_buf = read_buf.reader();
let mut res = Ok(());
let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config())
.accept_with(raw, |session| {
// push the early data to the tls session
while !read_buf.get_ref().is_empty() {
match session.read_tls(&mut read_buf) {
Ok(_) => {}
Err(e) => {
res = Err(e);
break;
}
}
}
});
res?;
let read_buf = read_buf.into_inner();
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says TLS handshake is ok and read_buf is empy.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
if !read_buf.is_empty() {
return Err(HandshakeError::EarlyData);
}
let tls_stream = accept.await.inspect_err(|_| {
if record_handshake_error {
Metrics::get().proxy.tls_handshake_failures.inc()
}
})?;
let conn_info = tls_stream.get_ref().1;
// check the ALPN, if exists, as required.
match conn_info.alpn_protocol() {
None | Some(PG_ALPN_PROTOCOL) => {}
Some(other) => {
// try parse ep for better error
let ep = conn_info.server_name().and_then(|sni| {
endpoint_sni(sni, &tls.common_names).ok().flatten()
});
let alpn = String::from_utf8_lossy(other);
warn!(?ep, %alpn, "unexpected ALPN");
return Err(HandshakeError::ProtocolViolation);
}
}
let tls_stream = raw
.upgrade(tls.to_server_config(), record_handshake_error)
.await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(conn_info.server_name())
.resolve(tls_stream.get_ref().1.server_name())
.ok_or(HandshakeError::MissingCertificate)?;
stream = PqStream {
framed: Framed {
stream: Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
},
read_buf,
write_buf,
},
};
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
}
}
_ => return Err(HandshakeError::ProtocolViolation),
@@ -187,9 +122,7 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
_ => return Err(HandshakeError::ProtocolViolation),
},
StartupMessage { params, version }
if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST =>
{
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
@@ -198,48 +131,9 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
.await?;
}
info!(?version, session_type = "normal", "successful handshake");
info!(session_type = "normal", "successful handshake");
break Ok(HandshakeData::Startup(stream, params));
}
// downgrade protocol version
StartupMessage { params, version }
if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
{
warn!(?version, "unsupported minor version");
// no protocol extensions are supported.
// <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/backend/tcop/backend_startup.c#L744-L753>
let mut unsupported = vec![];
for (k, _) in params.iter() {
if k.starts_with("_pq_.") {
unsupported.push(k);
}
}
// TODO: remove unsupported options so we don't send them to compute.
stream
.write_message(&Be::NegotiateProtocolVersion {
version: PG_PROTOCOL_LATEST,
options: &unsupported,
})
.await?;
info!(
?version,
session_type = "normal",
"successful handshake; unsupported minor version requested"
);
break Ok(HandshakeData::Startup(stream, params));
}
StartupMessage { version, .. } => {
warn!(
?version,
session_type = "normal",
"unsuccessful handshake; unsupported version"
);
return Err(HandshakeError::ProtocolViolation);
}
CancelRequest(cancel_key_data) => {
info!(session_type = "cancellation", "successful handshake");
break Ok(HandshakeData::Cancel(cancel_key_data));

Some files were not shown because too many files have changed in this diff Show More