Compare commits


19 Commits

Author  SHA1  Message  Date
Conrad Ludgate  18303e4d68  clean up  2024-08-13 15:08:57 +01:00
Conrad Ludgate  3df6d368e3  split out binaries  2024-08-13 15:08:57 +01:00
Conrad Ludgate  b62e7c0138  proxy: experiment with idea to split crates  2024-08-13 15:08:54 +01:00
Conrad Ludgate  a2968c6cf8  move proxy to proxy/code  2024-08-13 15:01:48 +01:00
Conrad Ludgate  bae1288671  make jwk renewal permits a bit more type safe  2024-08-13 11:08:25 +01:00
Conrad Ludgate  1254d8f56e  address some comments  2024-08-13 10:24:14 +01:00
Conrad Ludgate  073508493c  remove async_trait for FetchAuthRules  2024-08-12 16:14:53 +01:00
Conrad Ludgate  7cb2349296  add jwks size limiter  2024-08-12 11:48:57 +01:00
Conrad Ludgate  87151f9efd  ignore marvin vuln  2024-08-12 09:01:30 +01:00
Conrad Ludgate  96fe084c57  compact mock server  2024-08-12 09:01:04 +01:00
Conrad Ludgate  20fdf3e19f  extract fetch/update routine  2024-08-12 09:01:04 +01:00
Conrad Ludgate  c6b36d8171  fix lints  2024-08-12 09:01:04 +01:00
Conrad Ludgate  0e8a848937  finish happy path test  2024-08-12 09:01:04 +01:00
Conrad Ludgate  db4085fe22  mock tests for jwk renewal  2024-08-12 09:01:04 +01:00
Conrad Ludgate  0d895ba002  strip down supported algorithms to just RS256 and ES256  2024-08-12 09:01:04 +01:00
Conrad Ludgate  103f34e954  flesh out JWKs cache  2024-08-12 09:01:04 +01:00
Conrad Ludgate  262378e561  flesh out jwt code  2024-08-12 09:01:04 +01:00
Conrad Ludgate  9f38ab39c6  stash jwts  2024-08-12 09:01:04 +01:00
Conrad Ludgate  fa92328423  start stubbing jwt  2024-08-12 09:01:04 +01:00
363 changed files with 7404 additions and 17237 deletions

View File

@@ -23,30 +23,10 @@ platforms = [
]
[final-excludes]
workspace-members = [
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarily in a separate repo, neondatabase/autoscaling, and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
"vm_monitor",
# All of these exist in libs and are not usually built independently.
# Putting workspace hack there adds a bottleneck for cargo builds.
"compute_api",
"consumption_metrics",
"desim",
"metrics",
"pageserver_api",
"postgres_backend",
"postgres_connection",
"postgres_ffi",
"pq_proto",
"remote_storage",
"safekeeper_api",
"tenant_size_model",
"tracing-utils",
"utils",
"wal_craft",
"walproposer",
]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarily in a separate repo, neondatabase/autoscaling, and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
# Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true

View File

@@ -1,6 +0,0 @@
blank_issues_enabled: true
contact_links:
- name: Feature request
url: https://console.neon.tech/app/projects?modal=feedback
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`

View File

@@ -1,6 +1,7 @@
self-hosted-runner:
labels:
- arm64
- gen3
- large
- large-arm64
- small

View File

@@ -43,7 +43,7 @@ inputs:
pg_version:
description: 'Postgres version to use for tests'
required: false
default: 'v16'
default: 'v14'
benchmark_durations:
description: 'benchmark durations JSON'
required: false
@@ -71,7 +71,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
prefix: latest
# The lack of compatibility snapshot (for example, for the new Postgres version)
@@ -83,6 +83,7 @@ runs:
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
@@ -169,8 +170,10 @@ runs:
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
cov_prefix=()
else
cov_prefix=()
fi
@@ -211,13 +214,13 @@ runs:
fi
- name: Upload compatibility snapshot
# Note, that we use `github.base_ref` which is a target branch for a PR
if: github.event_name == 'pull_request' && github.base_ref == 'release'
if: github.ref_name == 'release'
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
prefix: latest
- name: Upload test results
if: ${{ !cancelled() }}

View File

@@ -1,36 +0,0 @@
name: "Set custom docker config directory"
description: "Create a directory for docker config and set DOCKER_CONFIG"
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
runs:
using: "composite"
steps:
- name: Show warning on GitHub-hosted runners
if: runner.environment == 'github-hosted'
shell: bash -euo pipefail {0}
run: |
# Using the following environment variables to find a path to the workflow file
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
# ${GITHUB_REPOSITORY} - octocat/hello-world
# ${GITHUB_REF} - refs/heads/my_branch
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
filename=${filename_with_ref%"@$GITHUB_REF"}
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
echo "::warning file=${filename},title=${title}::${message}"
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
env:
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
with:
main: |
mkdir -p "${DOCKER_CONFIG}"
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
post: |
if [ -d "${DOCKER_CONFIG}" ]; then
rm -r "${DOCKER_CONFIG}"
fi

View File

@@ -1,154 +0,0 @@
name: Prepare benchmarking databases by restoring dumps
on:
workflow_call:
# no inputs needed
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
setup-databases:
strategy:
fail-fast: false
matrix:
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
database: [ clickbench, tpch, userexample ]
env:
LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
PLATFORM: ${{ matrix.platform }}
PG_BINARIES: /tmp/neon/pg_install/v16/bin
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init
steps:
- name: Set up Connection String
id: set-up-prep-connstr
run: |
case "${PLATFORM}" in
neon)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
aws-rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
aws-aurora-serverless-v2-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
# we create a table with one row per database to restore, recording whether the restore is done
- name: Create benchmark_restore_status table if it does not exist
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
# to avoid a race condition of multiple jobs trying to create the table at the same time,
# we use an advisory lock
run: |
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
SELECT pg_advisory_lock(4711);
CREATE TABLE IF NOT EXISTS benchmark_restore_status (
databasename text primary key,
restore_done boolean
);
SELECT pg_advisory_unlock(4711);
"
- name: Check if restore is already done
id: check-restore-done
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
run: |
skip=false
if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
skip=true
fi
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
- name: Check and create database if it does not exist
if: steps.check-restore-done.outputs.skip != 'true'
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
run: |
DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
if [ "$DB_EXISTS" != "1" ]; then
echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
else
echo "Database ${{ env.DATABASE_NAME }} already exists."
fi
- name: Download dump from S3 to /tmp/dumps
if: steps.check-restore-done.outputs.skip != 'true'
env:
DATABASE_NAME: ${{ matrix.database }}
run: |
mkdir -p /tmp/dumps
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
- name: Replace database name in connection string
if: steps.check-restore-done.outputs.skip != 'true'
id: replace-dbname
env:
DATABASE_NAME: ${{ matrix.database }}
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
run: |
# Extract the part before the database name
base_connstr="${BENCHMARK_CONNSTR%/*}"
# Extract the query parameters (if any) after the database name
query_params="${BENCHMARK_CONNSTR#*\?}"
# Reconstruct the new connection string
if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
else
new_connstr="${base_connstr}/${DATABASE_NAME}"
fi
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
- name: Restore dump
if: steps.check-restore-done.outputs.skip != 'true'
env:
DATABASE_NAME: ${{ matrix.database }}
DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
# the following works only with larger computes:
# PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
# we add the || true because:
# the dumps were created with Neon and contain neon extensions that are not
# available in RDS, so we will always report an error, but we can ignore it
run: |
${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
-d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
- name: Update benchmark_restore_status table
if: steps.check-restore-done.outputs.skip != 'true'
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
run: |
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
"

View File

@@ -70,6 +70,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
@@ -94,16 +95,11 @@ jobs:
# We run tests with additional features that are turned off by default (e.g. in release builds); see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
env:
ARCH: ${{ inputs.arch }}
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
@@ -163,8 +159,6 @@ jobs:
# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
env:
ARCH: ${{ inputs.arch }}
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
@@ -179,7 +173,7 @@ jobs:
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
@@ -214,16 +208,10 @@ jobs:
export LD_LIBRARY_PATH
#nextest does not yet support running doctests
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
# run all non-pageserver tests
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
# run pageserver tests with different settings
for io_engine in std-fs tokio-epoll-uring ; do
for io_buffer_alignment in 0 1 512 ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
# Run separate tests for real S3
@@ -256,8 +244,8 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
# Don't run regression tests on debug arm64 builds
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
# Run test on x64 only
if: inputs.arch == 'x64'
needs: [ build-neon ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:
@@ -275,6 +263,7 @@ jobs:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set

View File

@@ -44,7 +44,7 @@ jobs:
grep -ERl $PAT .github/workflows |\
while read -r f
do
l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1)
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
done
exit 1

View File

@@ -96,7 +96,7 @@ jobs:
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
@@ -146,7 +146,6 @@ jobs:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -155,10 +154,7 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: |
Periodic perf testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -180,7 +176,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -219,23 +215,15 @@ jobs:
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
slack-message: |
Periodic replication testing: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -292,9 +280,8 @@ jobs:
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
}'
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -334,13 +321,9 @@ jobs:
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
prepare_AWS_RDS_databases:
uses: ./.github/workflows/_benchmarking_preparation.yml
secrets: inherit
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices, prepare_AWS_RDS_databases ]
needs: [ generate-matrices ]
permissions:
contents: write
statuses: write
@@ -377,7 +360,7 @@ jobs:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -472,7 +455,6 @@ jobs:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -481,10 +463,7 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: |
Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -558,7 +537,7 @@ jobs:
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
uses: aws-actions/configure-aws-credentials@v4
with:
@@ -593,9 +572,8 @@ jobs:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -604,10 +582,7 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: |
Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -620,7 +595,7 @@ jobs:
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
needs: [ generate-matrices, pgbench-compare ]
strategy:
fail-fast: false
@@ -628,7 +603,7 @@ jobs:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -680,7 +655,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -690,7 +664,6 @@ jobs:
TEST_OLAP_SCALE: 10
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -699,10 +672,7 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: |
Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -714,7 +684,7 @@ jobs:
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
needs: [ generate-matrices, clickbench-compare ]
strategy:
fail-fast: false
@@ -722,7 +692,7 @@ jobs:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -754,7 +724,7 @@ jobs:
ENV_PLATFORM=RDS_AURORA_TPCH
;;
rds-postgres)
ENV_PLATFORM=RDS_POSTGRES_TPCH
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -780,7 +750,6 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -788,7 +757,6 @@ jobs:
TEST_OLAP_SCALE: ${{ matrix.scale }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -797,16 +765,13 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: |
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
user-examples-compare:
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
needs: [ generate-matrices, tpch-compare ]
strategy:
fail-fast: false
@@ -814,7 +779,7 @@ jobs:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -871,7 +836,6 @@ jobs:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Create Allure report
id: create-allure-report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate
@@ -880,10 +844,6 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: |
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}
@@ -56,7 +56,13 @@ jobs:
- uses: actions/checkout@v4
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p /tmp/.docker-custom
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -83,6 +89,11 @@ jobs:
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf /tmp/.docker-custom
merge-images:
needs: [ build-image ]
runs-on: ubuntu-22.04

View File

@@ -48,7 +48,7 @@ jobs:
tag:
needs: [ check-permissions ]
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -90,7 +90,7 @@ jobs:
check-codestyle-python:
needs: [ check-permissions, build-build-tools-image ]
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -101,6 +101,9 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
uses: actions/cache@v4
@@ -122,12 +125,10 @@ jobs:
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
# There's no reason to expect clippy or code formatting to be different on different platforms,
# so it's enough to run these on x64 only.
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -141,6 +142,7 @@ jobs:
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
# Disabled for now
# - name: Restore cargo deps cache
@@ -168,25 +170,15 @@ jobs:
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: |
parallel --jobs 8 "cargo hack --feature-powerset --partition {}/8 clippy --target-dir target/partition-{} $CLIPPY_COMMON_ARGS" ::: 1 2 3 4 5 6 7 8
# instead of running the full release build, run the debug build again,
# but with `debug-assertions` disabled to exercise release code paths
- name: Run cargo clippy (debug, with debug-assertions=false)
run: |
for N in 4 8 10 12 14 16 18 20; do
echo "Running clippy with debug-assertions=false for partition ${N}"
time parallel --jobs ${N} "cargo hack --feature-powerset --partition {}/${N} clippy --target-dir target/partition-{} $CLIPPY_COMMON_ARGS -C debug-assertions=off" ::: $(seq -s " " 1 ${N})
rm -rf target/partition-*
done
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
- name: Check formatting
@@ -210,9 +202,9 @@ jobs:
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
arch: [ x64 ]
# Do not build or run tests in debug for release branches
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
include:
- build-type: release
arch: arm64
@@ -232,7 +224,7 @@ jobs:
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-build-tools-image ]
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -265,7 +257,7 @@ jobs:
benchmarks:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -292,7 +284,6 @@ jobs:
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
pg_version: v16
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -311,8 +302,9 @@ jobs:
with:
channel-id: C060CNA47S9 # on-call-staging-storage-stream
slack-message: |
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
Benchmarks failed on main: ${{ github.event.head_commit.url }}
Allure report: ${{ needs.create-test-report.outputs.report-url }}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
@@ -322,7 +314,7 @@ jobs:
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -369,7 +361,7 @@ jobs:
coverage-report:
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -483,7 +475,7 @@ jobs:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- name: Checkout
@@ -492,7 +484,12 @@ jobs:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -511,10 +508,7 @@ jobs:
- uses: docker/build-push-action@v6
with:
context: .
# ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure)
# https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md
build-args: |
ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
@@ -527,6 +521,11 @@ jobs:
tags: |
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
neon-image:
needs: [ neon-image-arch, tag ]
runs-on: ubuntu-22.04
@@ -562,7 +561,7 @@ jobs:
version: [ v14, v15, v16 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
steps:
- name: Checkout
@@ -571,7 +570,12 @@ jobs:
submodules: true
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v3
with:
cache-binary: false
@@ -654,6 +658,11 @@ jobs:
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
compute-node-image:
needs: [ compute-node-image-arch, tag ]
runs-on: ubuntu-22.04
@@ -707,7 +716,7 @@ jobs:
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
runs-on: [ self-hosted, gen3, large ]
strategy:
fail-fast: false
matrix:
@@ -726,7 +735,13 @@ jobs:
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
chmod +x vm-builder
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -749,6 +764,11 @@ jobs:
run: |
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
test-images:
needs: [ check-permissions, tag, neon-image, compute-node-image ]
strategy:
@@ -756,7 +776,7 @@ jobs:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
steps:
- name: Checkout
@@ -764,7 +784,13 @@ jobs:
with:
fetch-depth: 0
- uses: ./.github/actions/set-docker-config-dir
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
@@ -804,6 +830,11 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
promote-images:
permissions:
contents: read # This is required for actions/checkout
@@ -971,7 +1002,7 @@ jobs:
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
steps:
- name: Fix git ownership
@@ -991,6 +1022,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: false
fetch-depth: 0
- name: Trigger deploy workflow
@@ -998,10 +1030,10 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
@@ -1011,14 +1043,14 @@ jobs:
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
@@ -1028,7 +1060,7 @@ jobs:
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f branch=main \
@@ -1067,88 +1099,43 @@ jobs:
generate_release_notes: true,
})
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
if: github.ref_name == 'release'
runs-on: ubuntu-22.04
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
id: fetch-last-release-pr-info
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
branch_name_and_pr_number=$(gh pr list \
--repo "${GITHUB_REPOSITORY}" \
--base release \
--state merged \
--limit 10 \
--json mergeCommit,headRefName,number \
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
run_id=$(gh run list \
--repo "${GITHUB_REPOSITORY}" \
--workflow build_and_test.yml \
--branch "${branch_name}" \
--json databaseId \
--limit 1 \
--jq '.[].databaseId')
last_commit_sha=$(gh pr view "${pr_number}" \
--repo "${GITHUB_REPOSITORY}" \
--json commits \
--jq '.commits[-1].oid')
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
- name: Promote compatibility snapshot and Neon artifact
- name: Promote compatibility snapshot for the release
env:
BUCKET: neon-github-public-dev
AWS_REGION: eu-central-1
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
PREFIX: artifacts/latest
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
new_prefix="artifacts/latest"
files_to_promote=()
files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
for arch in X64 ARM64; do
# Update compatibility snapshot for the release
for pg_version in v14 v15 v16; do
for build_type in debug release; do
neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
if [ -z "${s3_key}" ]; then
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
exit 1
fi
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
files_to_promote+=("s3://${BUCKET}/${s3_key}")
for pg_version in v14 v15 v16; do
# We run fewer tests for debug builds, so we don't need to promote them
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
continue
fi
compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
if [ -z "${s3_key}" ]; then
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
exit 1
fi
files_to_promote+=("s3://${BUCKET}/${s3_key}")
done
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
done
for f in "${files_to_promote[@]}"; do
time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:
@@ -1172,12 +1159,10 @@ jobs:
# Format `needs` differently to make the list more readable.
# Usually we do `needs: [...]`
needs:
- build-and-test-locally
- check-codestyle-python
- check-codestyle-rust
- promote-images
- build-and-test-locally
- test-images
- trigger-custom-extensions-build-and-wait
runs-on: ubuntu-22.04
steps:
# The list of possible results:

View File

@@ -1,54 +0,0 @@
name: Add `external` label to issues and PRs created by external users
on:
issues:
types:
- opened
pull_request_target:
types:
- opened
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
env:
LABEL: external
jobs:
check-user:
runs-on: ubuntu-22.04
outputs:
is-member: ${{ steps.check-user.outputs.is-member }}
steps:
- name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
id: check-user
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
is_member=true
else
is_member=false
fi
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
add-label:
if: needs.check-user.outputs.is-member == 'false'
needs: [ check-user ]
runs-on: ubuntu-22.04
permissions:
pull-requests: write # for `gh pr edit`
issues: write # for `gh issue edit`
steps:
- name: Add `${{ env.LABEL }}` label
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
run: |
gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}

View File

@@ -56,6 +56,7 @@ jobs:
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Install macOS postgres dependencies
run: brew install flex bison openssl protobuf icu4c pkg-config
@@ -157,6 +158,7 @@ jobs:
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers

View File

@@ -27,7 +27,7 @@ concurrency:
jobs:
trigger_bench_on_ec2_machine_in_eu_central_1:
runs-on: [ self-hosted, small ]
runs-on: [ self-hosted, gen3, small ]
container:
image: neondatabase/build-tools:pinned
credentials:

Cargo.lock generated
View File

@@ -936,12 +936,6 @@ dependencies = [
"which",
]
[[package]]
name = "bit_field"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -1214,6 +1208,7 @@ dependencies = [
"serde_json",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -1326,6 +1321,7 @@ dependencies = [
"serde",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -1333,6 +1329,7 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"camino",
"clap",
"comfy-table",
@@ -1673,13 +1670,14 @@ dependencies = [
"smallvec",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "diesel"
version = "2.2.3"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
dependencies = [
"bitflags 2.4.1",
"byteorder",
@@ -2949,6 +2947,17 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "leaky-bucket"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
dependencies = [
"parking_lot 0.12.1",
"tokio",
"tracing",
]
[[package]]
name = "libc"
version = "0.2.150"
@@ -3138,6 +3147,7 @@ dependencies = [
"rand 0.8.5",
"rand_distr",
"twox-hash",
"workspace_hack",
]
[[package]]
@@ -3677,7 +3687,6 @@ dependencies = [
"async-compression",
"async-stream",
"async-trait",
"bit_field",
"byteorder",
"bytes",
"camino",
@@ -3702,6 +3711,7 @@ dependencies = [
"humantime-serde",
"hyper 0.14.26",
"itertools 0.10.5",
"leaky-bucket",
"md5",
"metrics",
"nix 0.27.1",
@@ -3726,7 +3736,6 @@ dependencies = [
"reqwest 0.12.4",
"rpds",
"scopeguard",
"send-future",
"serde",
"serde_json",
"serde_path_to_error",
@@ -3782,6 +3791,7 @@ dependencies = [
"strum_macros",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
@@ -3789,6 +3799,7 @@ name = "pageserver_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
@@ -4005,6 +4016,29 @@ dependencies = [
"indexmap 1.9.3",
]
[[package]]
name = "pg_sni_router"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"futures",
"git-version",
"itertools 0.10.5",
"pq_proto",
"proxy-core",
"proxy-sasl",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "phf"
version = "0.11.1"
@@ -4182,6 +4216,7 @@ dependencies = [
"tokio-rustls 0.25.0",
"tokio-util",
"tracing",
"workspace_hack",
]
[[package]]
@@ -4194,6 +4229,7 @@ dependencies = [
"postgres",
"tokio-postgres",
"url",
"workspace_hack",
]
[[package]]
@@ -4216,6 +4252,7 @@ dependencies = [
"serde",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
@@ -4253,6 +4290,7 @@ dependencies = [
"thiserror",
"tokio",
"tracing",
"workspace_hack",
]
[[package]]
@@ -4398,6 +4436,34 @@ dependencies = [
[[package]]
name = "proxy"
version = "0.1.0"
dependencies = [
"anyhow",
"aws-config",
"clap",
"futures",
"git-version",
"humantime",
"itertools 0.10.5",
"metrics",
"pq_proto",
"proxy-core",
"proxy-sasl",
"remote_storage",
"rustls 0.22.4",
"rustls-pemfile 2.1.1",
"socket2 0.5.5",
"tikv-jemallocator",
"tokio",
"tokio-util",
"tracing",
"tracing-utils",
"utils",
"uuid",
]
[[package]]
name = "proxy-core"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
@@ -4424,7 +4490,6 @@ dependencies = [
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.14.5",
"hashlink",
"hex",
@@ -4447,7 +4512,6 @@ dependencies = [
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"parking_lot 0.12.1",
"parquet",
@@ -4457,7 +4521,7 @@ dependencies = [
"postgres-protocol",
"postgres_backend",
"pq_proto",
"prometheus",
"proxy-sasl",
"rand 0.8.5",
"rand_distr",
"rcgen",
@@ -4487,7 +4551,6 @@ dependencies = [
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
@@ -4510,6 +4573,35 @@ dependencies = [
"x509-parser",
]
[[package]]
name = "proxy-sasl"
version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"base64 0.13.1",
"bytes",
"crossbeam-deque",
"hmac",
"itertools 0.10.5",
"lasso",
"measured",
"parking_lot 0.12.1",
"pbkdf2",
"postgres-protocol",
"pq_proto",
"rand 0.8.5",
"rustls 0.22.4",
"sha2",
"subtle",
"thiserror",
"tokio",
"tracing",
"uuid",
"workspace_hack",
"x509-parser",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -4817,6 +4909,7 @@ dependencies = [
"toml_edit 0.19.10",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
@@ -5341,6 +5434,7 @@ dependencies = [
"serde",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -5449,12 +5543,6 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
[[package]]
name = "send-future"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
[[package]]
name = "sentry"
version = "0.32.3"
@@ -5590,12 +5678,11 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.125"
version = "1.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@@ -5950,6 +6037,7 @@ name = "storage_controller_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
@@ -6182,6 +6270,7 @@ dependencies = [
"anyhow",
"serde",
"serde_json",
"workspace_hack",
]
[[package]]
@@ -6782,6 +6871,7 @@ dependencies = [
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"workspace_hack",
]
[[package]]
@@ -6952,6 +7042,7 @@ dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"bincode",
"byteorder",
"bytes",
@@ -6967,6 +7058,7 @@ dependencies = [
"humantime",
"hyper 0.14.26",
"jsonwebtoken",
"leaky-bucket",
"metrics",
"nix 0.27.1",
"once_cell",
@@ -6997,6 +7089,7 @@ dependencies = [
"url",
"uuid",
"walkdir",
"workspace_hack",
]
[[package]]
@@ -7075,6 +7168,7 @@ dependencies = [
"postgres_ffi",
"regex",
"utils",
"workspace_hack",
]
[[package]]
@@ -7095,6 +7189,7 @@ dependencies = [
"bindgen",
"postgres_ffi",
"utils",
"workspace_hack",
]
[[package]]
@@ -7651,6 +7746,8 @@ dependencies = [
"tokio",
"tokio-rustls 0.24.0",
"tokio-util",
"toml_datetime",
"toml_edit 0.19.10",
"tonic",
"tower",
"tracing",

View File

@@ -9,7 +9,10 @@ members = [
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"proxy",
"proxy/core",
"proxy/sasl",
"proxy/proxy",
"proxy/pg_sni_router",
"safekeeper",
"storage_broker",
"storage_controller",
@@ -65,7 +68,6 @@ axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.65"
bit_field = "0.10.2"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
@@ -108,12 +110,13 @@ ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
@@ -145,7 +148,6 @@ rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
send-future = "0.1.0"
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"

View File

@@ -35,9 +35,8 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \

View File

@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY patches/pg_hint_plan.patch /ext-src
COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
RUN patch -p1 < /ext-src/pg_hintplan.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch

View File

@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
#### Running neon database
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs

View File

@@ -44,7 +44,6 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
@@ -367,8 +366,6 @@ fn wait_spec(
state.start_time = now;
}
launch_lsn_lease_bg_task_for_static(&compute);
Ok(WaitSpecResult {
compute,
http_port,

View File

@@ -11,7 +11,6 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
pub mod lsn_lease;
mod migration;
pub mod monitor;
pub mod params;

View File

@@ -1,186 +0,0 @@
use anyhow::bail;
use anyhow::Result;
use postgres::{NoTls, SimpleQueryMessage};
use std::time::SystemTime;
use std::{str::FromStr, sync::Arc, thread, time::Duration};
use utils::id::TenantId;
use utils::id::TimelineId;
use compute_api::spec::ComputeMode;
use tracing::{info, warn};
use utils::{
lsn::Lsn,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use crate::compute::ComputeNode;
/// Spawns a background thread to periodically renew LSN leases for static compute.
/// Do nothing if the compute is not in static mode.
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
let (tenant_id, timeline_id, lsn) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("Spec must be set");
match spec.spec.mode {
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
_ => return,
}
};
let compute = compute.clone();
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
thread::spawn(move || {
let _entered = span.entered();
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
// TODO: might need stronger error feedback than logging a warning.
warn!("Exited with error: {e}");
}
});
}
/// Renews lsn lease periodically so static computes are not affected by GC.
fn lsn_lease_bg_task(
compute: Arc<ComputeNode>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<()> {
loop {
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
let valid_duration = valid_until
.duration_since(SystemTime::now())
.unwrap_or(Duration::ZERO);
// Sleep for 60 seconds less than the valid duration, but at least half of the valid duration.
let sleep_duration = valid_duration
.saturating_sub(Duration::from_secs(60))
.max(valid_duration / 2);
info!(
"Succeeded, sleeping for {} seconds",
sleep_duration.as_secs()
);
thread::sleep(sleep_duration);
}
}
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
fn acquire_lsn_lease_with_retry(
compute: &Arc<ComputeNode>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<SystemTime> {
let mut attempts = 0usize;
let mut retry_period_ms: f64 = 500.0;
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let configs = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
let conn_strings = spec.pageserver_connstr.split(',');
conn_strings
.map(|connstr| {
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
if let Some(storage_auth_token) = &spec.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token.clone());
} else {
info!("Storage auth token not set");
}
config
})
.collect::<Vec<_>>()
};
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
match result {
Ok(Some(res)) => {
return Ok(res);
}
Ok(None) => {
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
}
Err(e) => {
warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
thread::sleep(Duration::from_millis(retry_period_ms as u64));
retry_period_ms *= 1.5;
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
}
}
attempts += 1;
}
}
/// Tries to acquire an LSN lease through PS page_service API.
fn try_acquire_lsn_lease(
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
configs: &[postgres::Config],
) -> Result<Option<SystemTime>> {
fn get_valid_until(
config: &postgres::Config,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let mut client = config.connect(NoTls)?;
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
let res = client.simple_query(&cmd)?;
let msg = match res.first() {
Some(msg) => msg,
None => bail!("empty response"),
};
let row = match msg {
SimpleQueryMessage::Row(row) => row,
_ => bail!("error parsing lsn lease response"),
};
// Note: this will be None if a lease is explicitly not granted.
let valid_until_str = row.get("valid_until");
let valid_until = valid_until_str.map(|s| {
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
.expect("Time larger than max SystemTime could handle")
});
Ok(valid_until)
}
let shard_count = configs.len();
let valid_until = if shard_count > 1 {
configs
.iter()
.enumerate()
.map(|(shard_number, config)| {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_count: ShardCount::new(shard_count as u8),
shard_number: ShardNumber(shard_number as u8),
};
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
})
.collect::<Result<Vec<Option<SystemTime>>>>()?
.into_iter()
.min()
.unwrap()
} else {
get_valid_until(
&configs[0],
TenantShardId::unsharded(tenant_id),
timeline_id,
lsn,
)?
};
Ok(valid_until)
}

View File

@@ -6,6 +6,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true

View File

@@ -379,7 +379,7 @@ where
}
}
pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting
Ok(_) => Ok(false),

View File

@@ -15,9 +15,7 @@ use control_plane::local_env::{
};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::{
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
};
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::config::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -54,7 +52,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "16";
const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
@@ -1054,36 +1052,6 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
humantime_duration.as_ref()
}
fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
let maybe_instance_id = args.get_one::<u8>("instance-id");
let base_port = args.get_one::<u16>("base-port");
if maybe_instance_id.is_some() && base_port.is_none() {
panic!("storage-controller start specified instance-id but did not provide base-port");
}
let start_timeout = args
.get_one::<humantime::Duration>("start-timeout")
.expect("invalid value for start-timeout");
NeonStorageControllerStartArgs {
instance_id: maybe_instance_id.copied().unwrap_or(1),
base_port: base_port.copied(),
start_timeout: *start_timeout,
}
}
fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
let maybe_instance_id = args.get_one::<u8>("instance-id");
let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
NeonStorageControllerStopArgs {
instance_id: maybe_instance_id.copied().unwrap_or(1),
immediate,
}
}
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
@@ -1145,14 +1113,19 @@ async fn handle_storage_controller(
let svc = StorageController::from_env(env);
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
eprintln!("start failed: {e}");
exit(1);
}
}
Some(("stop", stop_match)) => {
if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
let immediate = stop_match
.get_one::<String>("stop-mode")
.map(|s| s.as_str())
== Some("immediate");
if let Err(e) = svc.stop(immediate).await {
eprintln!("stop failed: {}", e);
exit(1);
}
@@ -1255,12 +1228,7 @@ async fn handle_start_all(
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller
.start(NeonStorageControllerStartArgs::with_default_instance_id(
(*retry_timeout).into(),
))
.await
{
if let Err(e) = storage_controller.start(retry_timeout).await {
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
@@ -1390,21 +1358,10 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
eprintln!("neon broker stop failed: {e:#}");
}
// Stop all storage controller instances. In the most common case there's only one,
// but iterate though the base data directory in order to discover the instances.
let storcon_instances = env
.storage_controller_instances()
.await
.expect("Must inspect data dir");
for (instance_id, _instance_dir_path) in storcon_instances {
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
let stop_args = NeonStorageControllerStopArgs {
instance_id,
immediate,
};
if let Err(e) = storage_controller.stop(stop_args).await {
eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
if let Err(e) = storage_controller.stop(immediate).await {
eprintln!("storage controller stop failed: {e:#}");
}
}
}
@@ -1544,18 +1501,6 @@ fn cli() -> Command {
.action(ArgAction::SetTrue)
.required(false);
let instance_id = Arg::new("instance-id")
.long("instance-id")
.help("Identifier used to distinguish storage controller instances (default 1)")
.value_parser(value_parser!(u8))
.required(false);
let base_port = Arg::new("base-port")
.long("base-port")
.help("Base port for the storage controller instance identified by instance-id (defaults to pageserver cplane api)")
.value_parser(value_parser!(u16))
.required(false);
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1664,12 +1609,9 @@ fn cli() -> Command {
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller")
.arg(timeout_arg.clone())
.arg(instance_id.clone())
.arg(base_port))
.arg(timeout_arg.clone()))
.subcommand(Command::new("stop").about("Stop storage controller")
.arg(stop_mode_arg.clone())
.arg(instance_id))
.arg(stop_mode_arg.clone()))
)
.subcommand(
Command::new("safekeeper")

View File

@@ -824,12 +824,11 @@ impl Endpoint {
// cleanup work to do after postgres stops, like syncing safekeepers,
// etc.
//
// If destroying or stop mode is immediate, send it SIGTERM before
// waiting. Sometimes we do *not* want this cleanup: tests intentionally
// do stop when majority of safekeepers is down, so sync-safekeepers
// would hang otherwise. This could be a separate flag though.
let send_sigterm = destroy || mode == "immediate";
self.wait_for_compute_ctl_to_exit(send_sigterm)?;
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
// want this cleanup: tests intentionally do stop when majority of
// safekeepers is down, so sync-safekeepers would hang otherwise. This
// could be a separate flag though.
self.wait_for_compute_ctl_to_exit(destroy)?;
if destroy {
println!(
"Destroying postgres data directory '{}'",

View File

@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 16;
pub const DEFAULT_PG_VERSION: u32 = 15;
//
// This data structure represents the neon_local CLI config
@@ -156,18 +156,10 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub max_warming_up: Duration,
pub start_as_candidate: bool,
/// Database url used when running multiple storage controller instances
pub database_url: Option<SocketAddr>,
/// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>,
pub max_secondary_lag_bytes: Option<u64>,
#[serde(with = "humantime_serde")]
pub heartbeat_interval: Duration,
}
impl NeonStorageControllerConf {
@@ -175,9 +167,6 @@ impl NeonStorageControllerConf {
const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
// Very tight heartbeat interval to speed up tests
const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
}
impl Default for NeonStorageControllerConf {
@@ -185,11 +174,8 @@ impl Default for NeonStorageControllerConf {
Self {
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
start_as_candidate: false,
database_url: None,
split_threshold: None,
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
}
}
}
@@ -406,36 +392,6 @@ impl LocalEnv {
}
}
/// Inspect the base data directory and extract the instance id and instance directory path
/// for all storage controller instances
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
let mut instances = Vec::default();
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
for dentry in dir {
let dentry = dentry?;
let is_dir = dentry.metadata()?.is_dir();
let filename = dentry.file_name().into_string().unwrap();
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
Some(suffix) => suffix.parse::<u8>().ok(),
None => None,
};
let is_instance_dir = is_dir && parsed_instance_id.is_some();
if !is_instance_dir {
continue;
}
instances.push((
parsed_instance_id.expect("Checked previously"),
dentry.path(),
));
}
Ok(instances)
}
pub fn register_branch_mapping(
&mut self,
branch_name: String,

View File

@@ -181,23 +181,6 @@ impl PageServerNode {
);
io::stdout().flush()?;
// If the config file we got as a CLI argument includes the `availability_zone`
// config, then use that to populate the `metadata.json` file for the pageserver.
// In production the deployment orchestrator does this for us.
let az_id = conf
.other
.get("availability_zone")
.map(|toml| {
let az_str = toml.to_string();
// Trim the (") chars from the toml representation
if az_str.starts_with('"') && az_str.ends_with('"') {
az_str[1..az_str.len() - 1].to_string()
} else {
az_str
}
})
.unwrap_or("local".to_string());
let config = self
.pageserver_init_make_toml(conf)
.context("make pageserver toml")?;
@@ -233,7 +216,6 @@ impl PageServerNode {
let (_http_host, http_port) =
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
let http_port = http_port.unwrap_or(9898);
// Intentionally hand-craft JSON: this acts as an implicit format compat test
// in case the pageserver-side structure is edited, and reflects the real life
// situation: the metadata is written by some other script.
@@ -244,10 +226,7 @@ impl PageServerNode {
postgres_port: self.pg_connection_config.port(),
http_host: "localhost".to_string(),
http_port,
other: HashMap::from([(
"availability_zone_id".to_string(),
serde_json::json!(az_id),
)]),
other: HashMap::new(),
})
.unwrap(),
)

View File

@@ -5,7 +5,6 @@
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
use std::future::Future;
use std::io::Write;
use std::path::PathBuf;
use std::time::Duration;
@@ -35,10 +34,12 @@ pub enum SafekeeperHttpError {
type Result<T> = result::Result<T, SafekeeperHttpError>;
pub(crate) trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
let status = self.status();

View File

@@ -3,8 +3,6 @@ use crate::{
local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Uri;
use nix::unistd::Pid;
use pageserver_api::{
controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
use std::{fs, str::FromStr, time::Duration};
use tokio::process::Command;
use tracing::instrument;
use url::Url;
@@ -31,14 +29,12 @@ use utils::{
pub struct StorageController {
env: LocalEnv,
listen: String,
private_key: Option<Vec<u8>>,
public_key: Option<String>,
postgres_port: u16,
client: reqwest::Client,
config: NeonStorageControllerConf,
// The listen addresses is learned when starting the storage controller,
// hence the use of OnceLock to init it at the right time.
listen: OnceLock<SocketAddr>,
}
const COMMAND: &str = "storage_controller";
@@ -47,36 +43,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
const DB_NAME: &str = "storage_controller";
pub struct NeonStorageControllerStartArgs {
pub instance_id: u8,
pub base_port: Option<u16>,
pub start_timeout: humantime::Duration,
}
impl NeonStorageControllerStartArgs {
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
Self {
instance_id: 1,
base_port: None,
start_timeout,
}
}
}
pub struct NeonStorageControllerStopArgs {
pub instance_id: u8,
pub immediate: bool,
}
impl NeonStorageControllerStopArgs {
pub fn with_default_instance_id(immediate: bool) -> Self {
Self {
instance_id: 1,
immediate,
}
}
}
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -101,6 +67,23 @@ pub struct InspectResponse {
impl StorageController {
pub fn from_env(env: &LocalEnv) -> Self {
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.control_plane_api.clone().unwrap();
let listen = format!(
"{}:{}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
);
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
// port, for use by our captive postgres.
let postgres_port = listen_url
.port()
.expect("Control plane API setting should always have a port")
+ 1;
// Assume all pageservers have symmetric auth configuration: this service
// expects to use one JWT token to talk to all of them.
let ps_conf = env
@@ -143,28 +126,20 @@ impl StorageController {
Self {
env: env.clone(),
listen,
private_key,
public_key,
postgres_port,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
config: env.storage_controller.clone(),
listen: OnceLock::default(),
}
}
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
self.env
.base_data_dir
.join(format!("storage_controller_{}", instance_id))
}
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(
self.storage_controller_instance_dir(instance_id)
.join("storage_controller.pid"),
)
.expect("non-Unicode path")
fn pid_file(&self) -> Utf8PathBuf {
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
.expect("non-Unicode path")
}
/// PIDFile for the postgres instance used to store storage controller state
@@ -209,23 +184,23 @@ impl StorageController {
}
/// Readiness check for our postgres process
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
let bin_path = pg_bin_dir.join("pg_isready");
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
Ok(exitcode.success())
}
/// Create our database if it doesn't exist
/// Create our database if it doesn't exist, and run migrations.
///
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
/// who just want to run `cargo neon_local` without knowing about diesel.
///
/// Returns the database url
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
pub async fn setup_database(&self) -> anyhow::Result<String> {
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
let createdb_path = pg_bin_dir.join("createdb");
@@ -234,7 +209,7 @@ impl StorageController {
"-h",
"localhost",
"-p",
&format!("{}", postgres_port),
&format!("{}", self.postgres_port),
DB_NAME,
])
.output()
@@ -255,14 +230,13 @@ impl StorageController {
pub async fn connect_to_database(
&self,
postgres_port: u16,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
tokio_postgres::Config::new()
.host("localhost")
.port(postgres_port)
.port(self.postgres_port)
// The user is the ambient operating system user name.
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
//
@@ -278,114 +252,72 @@ impl StorageController {
.map_err(anyhow::Error::new)
}
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
if err.kind() != std::io::ErrorKind::AlreadyExists {
panic!("Failed to create instance dir {instance_dir:?}");
}
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_lib_dir = self.get_pg_lib_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
let (listen, postgres_port) = {
if let Some(base_port) = start_args.base_port {
(
format!("127.0.0.1:{base_port}"),
self.config
.database_url
.expect("--base-port requires NeonStorageControllerConf::database_url")
.port(),
)
} else {
let listen_url = self.env.control_plane_api.clone().unwrap();
let listen = format!(
"{}:{}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
);
(listen, listen_url.port().unwrap() + 1)
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.envs(vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
};
let socket_addr = listen
.parse()
.expect("listen address is a valid socket address");
self.listen
.set(socket_addr)
.expect("StorageController::listen is only set here");
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
// Do we remove the pid file on stop?
let pg_started = self.is_postgres_running().await?;
let pg_lib_dir = self.get_pg_lib_dir().await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
if !pg_started {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_log_path = pg_data_path.join("postgres.log");
background_process::start_process(
"storage_controller_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
retry_timeout,
|| self.pg_isready(&pg_bin_dir),
)
.await?;
if !tokio::fs::try_exists(&pg_data_path).await? {
// Initialize empty database
let initdb_path = pg_bin_dir.join("initdb");
let mut child = Command::new(&initdb_path)
.envs(vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
])
.args(["-D", pg_data_path.as_ref()])
.spawn()
.expect("Failed to spawn initdb");
let status = child.wait().await?;
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
};
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", postgres_port),
)
.await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
"-D",
pg_data_path.as_ref(),
"-l",
pg_log_path.as_ref(),
"start",
];
background_process::start_process(
"storage_controller_db",
&self.env.base_data_dir,
pg_bin_dir.join("pg_ctl").as_std_path(),
db_start_args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
&start_args.start_timeout,
|| self.pg_isready(&pg_bin_dir, postgres_port),
)
.await?;
self.setup_database(postgres_port).await?;
}
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
// We support running a startup SQL script to fiddle with the database before we launch storcon.
// This is used by the test suite.
@@ -407,7 +339,7 @@ impl StorageController {
}
}
};
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
let (mut client, conn) = self.connect_to_database().await?;
let conn = tokio::spawn(conn);
let tx = client.build_transaction();
let tx = tx.start().await?;
@@ -416,20 +348,9 @@ impl StorageController {
drop(client);
conn.await??;
let listen = self
.listen
.get()
.expect("cell is set earlier in this function");
let address_for_peers = Uri::builder()
.scheme("http")
.authority(format!("{}:{}", listen.ip(), listen.port()))
.path_and_query("")
.build()
.unwrap();
let mut args = vec![
"-l",
&listen.to_string(),
&self.listen,
"--dev",
"--database-url",
&database_url,
@@ -437,29 +358,15 @@ impl StorageController {
&humantime::Duration::from(self.config.max_offline).to_string(),
"--max-warming-up-interval",
&humantime::Duration::from(self.config.max_warming_up).to_string(),
"--heartbeat-interval",
&humantime::Duration::from(self.config.heartbeat_interval).to_string(),
"--address-for-peers",
&address_for_peers.to_string(),
]
.into_iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
if self.config.start_as_candidate {
args.push("--start-as-candidate".to_string());
}
if let Some(private_key) = &self.private_key {
let claims = Claims::new(None, Scope::PageServerApi);
let jwt_token =
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
args.push(format!("--jwt-token={jwt_token}"));
let peer_claims = Claims::new(None, Scope::Admin);
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
.expect("failed to generate jwt token");
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
}
if let Some(public_key) = &self.public_key {
@@ -487,15 +394,15 @@ impl StorageController {
background_process::start_process(
COMMAND,
&instance_dir,
&self.env.base_data_dir,
&self.env.storage_controller_bin(),
args,
vec![
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
],
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
&start_args.start_timeout,
background_process::InitialPidFile::Create(self.pid_file()),
retry_timeout,
|| async {
match self.ready().await {
Ok(_) => Ok(true),
@@ -508,35 +415,8 @@ impl StorageController {
Ok(())
}
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
background_process::stop_process(
stop_args.immediate,
COMMAND,
&self.pid_file(stop_args.instance_id),
)?;
let storcon_instances = self.env.storage_controller_instances().await?;
for (instance_id, instanced_dir_path) in storcon_instances {
if instance_id == stop_args.instance_id {
continue;
}
let pid_file = instanced_dir_path.join("storage_controller.pid");
let pid = tokio::fs::read_to_string(&pid_file)
.await
.map_err(|err| {
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
})?
.parse::<i32>()
.expect("pid is valid i32");
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
if other_proc_alive {
// There is another storage controller instance running, so we return
// and leave the database running.
return Ok(());
}
}
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -549,51 +429,27 @@ impl StorageController {
.wait()
.await?;
if !stop_status.success() {
match self.is_postgres_running().await {
Ok(false) => {
println!("Storage controller database is already stopped");
return Ok(());
}
Ok(true) => {
anyhow::bail!("Failed to stop storage controller database");
}
Err(err) => {
anyhow::bail!("Failed to stop storage controller database: {err}");
}
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
println!("Storage controller database is already stopped");
return Ok(());
} else {
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
}
}
Ok(())
}
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
let pg_bin_dir = self.get_pg_bin_dir().await?;
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
.args(pg_status_args)
.spawn()?
.wait()
.await?;
// pg_ctl status returns this exit code if postgres is not running: in this case it is
// fine that stop failed. Otherwise it is an error that stop failed.
const PG_STATUS_NOT_RUNNING: i32 = 3;
const PG_NO_DATA_DIR: i32 = 4;
const PG_STATUS_RUNNING: i32 = 0;
match status_exitcode.code() {
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
Some(PG_NO_DATA_DIR) => Ok(false),
Some(PG_STATUS_RUNNING) => Ok(true),
Some(code) => Err(anyhow::anyhow!(
"pg_ctl status returned unexpected status code: {:?}",
code
)),
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
}
}
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
let category = match path.find('/') {
Some(idx) => &path[..idx],
@@ -619,31 +475,15 @@ impl StorageController {
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// In the special case of the `storage_controller start` subcommand, we wish
// to use the API endpoint of the newly started storage controller in order
// to pass the readiness check. In this scenario [`Self::listen`] will be set
// (see [`Self::start`]).
//
// Otherwise, we infer the storage controller api endpoint from the configured
// control plane API.
let url = if let Some(socket_addr) = self.listen.get() {
Url::from_str(&format!(
"http://{}:{}/{path}",
socket_addr.ip().to_canonical(),
socket_addr.port()
))
.unwrap()
} else {
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap()
};
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {

View File

@@ -41,8 +41,6 @@ enum Command {
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
#[arg(long)]
availability_zone_id: String,
},
/// Modify a node's configuration in the storage controller
@@ -149,9 +147,9 @@ enum Command {
#[arg(long)]
threshold: humantime::Duration,
},
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
// Drain a set of specified pageservers by moving the primary attachments to pageservers
// outside of the specified set.
BulkMigrate {
Drain {
// Set of pageserver node ids to drain.
#[arg(long)]
nodes: Vec<NodeId>,
@@ -165,34 +163,6 @@ enum Command {
#[arg(long)]
dry_run: Option<bool>,
},
/// Start draining the specified pageserver.
/// The drain is complete when the scheduling policy returns to active.
StartDrain {
#[arg(long)]
node_id: NodeId,
},
/// Cancel draining the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried.
CancelDrain {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
timeout: humantime::Duration,
},
/// Start filling the specified pageserver.
/// The fill is complete when the scheduling policy returns to active.
StartFill {
#[arg(long)]
node_id: NodeId,
},
/// Cancel filling the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried.
CancelFill {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
timeout: humantime::Duration,
},
}
#[derive(Parser)]
@@ -279,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
}
}
async fn wait_for_scheduling_policy<F>(
client: Client,
node_id: NodeId,
timeout: Duration,
f: F,
) -> anyhow::Result<NodeSchedulingPolicy>
where
F: Fn(NodeSchedulingPolicy) -> bool,
{
let waiter = tokio::time::timeout(timeout, async move {
loop {
let node = client
.dispatch::<(), NodeDescribeResponse>(
Method::GET,
format!("control/v1/node/{node_id}"),
None,
)
.await?;
if f(node.scheduling) {
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
}
}
});
Ok(waiter.await??)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
@@ -324,7 +266,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
availability_zone_id,
} => {
storcon_client
.dispatch::<_, ()>(
@@ -336,7 +277,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
availability_zone_id: Some(availability_zone_id),
}),
)
.await?;
@@ -682,13 +622,12 @@ async fn main() -> anyhow::Result<()> {
threshold: threshold.into(),
},
)),
heatmap_period: Some("300s".to_string()),
..Default::default()
},
})
.await?;
}
Command::BulkMigrate {
Command::Drain {
nodes,
concurrency,
max_shards,
@@ -717,7 +656,7 @@ async fn main() -> anyhow::Result<()> {
}
if nodes.len() != node_to_drain_descs.len() {
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
anyhow::bail!("Drain requested for node which doesn't exist.")
}
node_to_fill_descs.retain(|desc| {
@@ -729,7 +668,7 @@ async fn main() -> anyhow::Result<()> {
});
if node_to_fill_descs.is_empty() {
anyhow::bail!("There are no nodes to migrate to")
anyhow::bail!("There are no nodes to drain to")
}
// Set the node scheduling policy to draining for the nodes which
@@ -750,7 +689,7 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
// Perform the migration: move each tenant shard scheduled on a node to
// Perform the drain: move each tenant shard scheduled on a node to
// be drained to a node which is being filled. A simple round robin
// strategy is used to pick the new node.
let tenants = storcon_client
@@ -763,13 +702,13 @@ async fn main() -> anyhow::Result<()> {
let mut selected_node_idx = 0;
struct MigrationMove {
struct DrainMove {
tenant_shard_id: TenantShardId,
from: NodeId,
to: NodeId,
}
let mut moves: Vec<MigrationMove> = Vec::new();
let mut moves: Vec<DrainMove> = Vec::new();
let shards = tenants
.into_iter()
@@ -799,7 +738,7 @@ async fn main() -> anyhow::Result<()> {
continue;
}
moves.push(MigrationMove {
moves.push(DrainMove {
tenant_shard_id: shard.tenant_shard_id,
from: shard
.node_attached
@@ -876,67 +815,6 @@ async fn main() -> anyhow::Result<()> {
failure
);
}
Command::StartDrain { node_id } => {
storcon_client
.dispatch::<(), ()>(
Method::PUT,
format!("control/v1/node/{node_id}/drain"),
None,
)
.await?;
println!("Drain started for {node_id}");
}
Command::CancelDrain { node_id, timeout } => {
storcon_client
.dispatch::<(), ()>(
Method::DELETE,
format!("control/v1/node/{node_id}/drain"),
None,
)
.await?;
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
let final_policy =
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
use NodeSchedulingPolicy::*;
matches!(sched, Active | PauseForRestart)
})
.await?;
println!(
"Drain was cancelled for node {node_id}. Scheduling policy is now {final_policy:?}"
);
}
Command::StartFill { node_id } => {
storcon_client
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
.await?;
println!("Fill started for {node_id}");
}
Command::CancelFill { node_id, timeout } => {
storcon_client
.dispatch::<(), ()>(
Method::DELETE,
format!("control/v1/node/{node_id}/fill"),
None,
)
.await?;
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
let final_policy =
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
use NodeSchedulingPolicy::*;
matches!(sched, Active)
})
.await?;
println!(
"Fill was cancelled for node {node_id}. Scheduling policy is now {final_policy:?}"
);
}
}
Ok(())

View File

@@ -3,7 +3,7 @@ set -x
cd /ext-src || exit 2
FAILED=
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d "${d}" ] || continue

View File

@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
during the restart at 2024-04-03 16:37 UTC.
Note that lots of shutdowns on loaded pageservers do not finish within the
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
This problem is not yet very acutely felt in storage controller managed pageservers since

View File

@@ -1,495 +0,0 @@
# Safekeeper dynamic membership change
To quickly recover from safekeeper node failures and do rebalancing we need to
be able to change the set of safekeepers a timeline resides on. The procedure must
be safe (never lose committed log) regardless of the state of the safekeepers and
the compute. It should be able to progress as long as any majority of the old
safekeeper set, any majority of the new safekeeper set and the compute are up and
connected. This is known as a consensus membership change. It always involves two
phases: 1) switch the old majority to the old + new configuration, preventing
commits without acknowledgement from the new set; 2) bootstrap the new set by
ensuring that a majority of the new set has all data which could ever have been
committed before the first phase completed; after that the switch is safe to
finish. Without the two phases, a direct switch to a new set is unsafe because its
quorums might not intersect with quorums of the old set (the typical ABC -> ABD
switch is an example: quorums AC and BD don't intersect). Furthermore, the
procedure is typically carried out by the consensus leader, and the enumeration of
configurations which establishes order between them is done through the consensus
log.
In our case the consensus leader is the compute (walproposer), and we don't want
to wake up all computes for the change. Nor do we want to fully reimplement the
leader logic a second time outside the compute. Because of that, the proposed
algorithm relies, for issuing configurations, on an external fault-tolerant
(distributed), strongly consistent storage with a simple API: CAS
(compare-and-swap) on a single key. Properly configured postgres suits this.
In the system, consensus is implemented at the timeline level, so the algorithm
below applies to a single timeline.
## Algorithm
### Definitions
A configuration is
```
struct Configuration {
generation: Generation, // a number uniquely identifying configuration
sk_set: Vec<NodeId>, // current safekeeper set
new_sk_set: Optional<Vec<NodeId>>,
}
```
A configuration with `new_sk_set` present is used for the intermediate step
during the change and is called a joint configuration. Generations establish an
order of configurations: we say `c1` is higher than `c2` if `c1.generation` >
`c2.generation`.
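For illustration, here is a minimal Rust sketch (not actual safekeeper code) of
how the ordering and membership rules could be expressed over a `Configuration`
like the one above; `Generation` and `NodeId` are assumed to be plain integers
and the struct is re-declared so the sketch is self-contained:

```
// Sketch only: helpers capturing "higher generation wins", "joint means both
// sets are present" and "a node is a member if it is in either set".
type Generation = u64;
type NodeId = u64;

struct Configuration {
    generation: Generation,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

impl Configuration {
    /// `self` supersedes `other` iff its generation is strictly higher.
    fn is_higher_than(&self, other: &Configuration) -> bool {
        self.generation > other.generation
    }

    /// A joint configuration carries both the old and the new safekeeper set.
    fn is_joint(&self) -> bool {
        self.new_sk_set.is_some()
    }

    /// A node participates in the configuration if it is in either set.
    fn is_member(&self, node: NodeId) -> bool {
        self.sk_set.contains(&node)
            || self.new_sk_set.as_ref().is_some_and(|s| s.contains(&node))
    }
}
```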
### Persistently stored data changes
The safekeeper starts storing its current configuration in the control file.
The update is atomic, so the in-memory value always matches the persistent one.
The external CAS-providing storage (let's call it the configuration storage
here) also stores the configuration for each timeline. It is initialized with
generation 1 and the initial set of safekeepers during timeline creation. An
executed CAS on it must never be lost.
### Compute <-> safekeeper protocol changes
The `ProposerGreeting` message carries the walproposer's configuration if it is
already established (see below), else null. The `AcceptorGreeting` message
carries the safekeeper's current `Configuration`. All further messages
(`VoteRequest`, `VoteResponse`, `ProposerElected`, `AppendRequest`,
`AppendResponse`) carry a generation number: the walproposer's in wp->sk
messages and the safekeeper's in sk->wp messages.
### Safekeeper changes
Basic rule: once a safekeeper observes a configuration higher than its own it
immediately switches to it. It must refuse all messages with a generation lower
than its own. It also refuses messages if it is not a member of the current
configuration (that is, of either `sk_set` or `new_sk_set`), though it is likely
not unsafe to process them (the walproposer should ignore the result anyway).
If there is a non-null configuration in `ProposerGreeting` and it is higher than
the safekeeper's current one, the safekeeper switches to it.
The safekeeper sends its current configuration in its first message to the
walproposer, `AcceptorGreeting`. It refuses all other walproposer messages if
the configuration generation in them is less than its current one. Namely, it
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In the
response it sends its current configuration generation to let the walproposer
know.
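As an illustration, a minimal sketch of this gating rule (the names `Outcome`
and `gate_by_generation` are invented for the sketch and do not exist in the
codebase); adopting a *higher* configuration happens separately, when the full
`Configuration` is available from `ProposerGreeting` or the configuration
endpoint:

```
// Sketch only: refuse any message whose generation is below ours and report
// our generation back so the walproposer can notice the configuration change.
type Generation = u64;

enum Outcome {
    Accepted,
    Refused { current_generation: Generation },
}

fn gate_by_generation(current: Generation, incoming: Generation) -> Outcome {
    if incoming < current {
        // Covers voting, WAL truncation in handle_elected and WAL append.
        Outcome::Refused {
            current_generation: current,
        }
    } else {
        Outcome::Accepted
    }
}
```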
The safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
accepting a `Configuration`. The safekeeper switches to the given conf if it is
higher than its current one and ignores it otherwise. In any case it replies with
```
struct ConfigurationSwitchResponse {
conf: Configuration,
term: Term,
last_log_term: Term,
flush_lsn: Lsn,
}
```
### Compute (walproposer) changes
The basic rule is that a joint configuration requires votes from majorities in
both `sk_set` and `new_sk_set`.
Compute receives the list of safekeepers to connect to from the control plane
as it does currently and tries to communicate with all of them. However, the
list does not define the consensus members. Instead, on start the walproposer
tracks the highest configuration it receives from `AcceptorGreeting`s. Once it
assembles greetings from a majority of `sk_set` and a majority of `new_sk_set`
(if it is present), it establishes this configuration as its own and moves to
voting.
It should stop talking to safekeepers not listed in the configuration at this
point, though it is not unsafe to continue doing so.
To be elected it must receive votes from both majorities if `new_sk_set` is present.
Similarly, to commit WAL it must receive flush acknowledge from both majorities.
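A minimal sketch of the joint quorum rule (function names are illustrative, not
the walproposer's actual API); the same check applies to collecting greetings,
votes and flush acknowledgements:

```
// Sketch only: in a joint configuration both the old and the new set must
// supply a majority; otherwise only the current sk_set is consulted.
type NodeId = u64;

fn majority_of(acked: &[NodeId], members: &[NodeId]) -> bool {
    let acks = members.iter().filter(|m| acked.contains(m)).count();
    acks > members.len() / 2
}

fn quorum_reached(sk_set: &[NodeId], new_sk_set: Option<&[NodeId]>, acked: &[NodeId]) -> bool {
    majority_of(acked, sk_set) && new_sk_set.map_or(true, |new| majority_of(acked, new))
}
```

For the ABC -> ABD example from the introduction, the joint phase therefore
needs acknowledgements from two of {A, B, C} and two of {A, B, D} before
anything is considered committed.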
If the walproposer hears from a safekeeper a configuration higher than its own
(i.e. a refusal to accept due to a configuration change), it simply restarts.
### Change algorithm
The following algorithm can be executed anywhere having access to configuration
storage and safekeepers. It is safe to interrupt / restart it and run multiple
instances of it concurrently, though likely one of them won't make
progress then. It accepts `desired_set: Vec<NodeId>` as input.
The algorithm will refuse to make the change if it encounters a previous
interrupted change attempt, but in that case it will try to finish it.
It will eventually converge if the old majority, the new majority and the
configuration storage are reachable.
1) Fetch current timeline configuration from the configuration storage.
2) If it is already a joint one and `new_sk_set` is different from `desired_set`,
refuse to change. However, assign the joint conf to the (in-memory) var
`joint_conf` and proceed to step 4 to finish the ongoing change.
3) Else, create joint `joint_conf: Configuration`: increment current conf number
`n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
storage by doing CAS on the current generation: change happens only if
current configuration number is still `n`. Apart from guaranteeing uniqueness
of configurations, CAS linearizes them, ensuring that new configuration is
created only following the previous one when we know that the transition is
safe. A failed CAS aborts the procedure (a minimal sketch of this CAS is given after this list).
4) Call `PUT` `configuration` on safekeepers from the current set,
delivering them `joint_conf`. Collecting responses from majority is required
to proceed. If any response returned generation higher than
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
max `<last_log_term, flush_lsn>` among responses and establish it as
(in memory) `sync_position`. Also choose max `term` and establish it as (in
memory) `sync_term`. We can't finish the switch until majority of the new set
catches up to this `sync_position` because data before it could be committed
without ack from the new set. Similarly, we'll bump term on new majority
to `sync_term` so that two computes with the same term are never elected.
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
doesn't exist yet by doing `pull_timeline` from the majority of the
current set. Doing that on majority of `new_sk_set` is enough to
proceed, but it is reasonable to ensure that all `new_sk_set` members
are initialized -- if some of them are down why are we migrating there?
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
Success on majority is enough.
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
delivering them `joint_conf` and collecting their positions. This will
switch them to `joint_conf`, which generally won't be needed
because `pull_timeline` already includes it and it would additionally be
broadcast by compute. More importantly, we may proceed to the next step
only when `<last_log_term, flush_lsn>` on the majority of the new set has reached
`sync_position`. Similarly, on the happy path no waiting is needed because
`pull_timeline` already includes it. However, we should double
check to be safe. For example, the timeline could have been created earlier, e.g.
manually or after a try-to-migrate, abort, try-to-migrate-again sequence.
7) Create `new_conf: Configuration` incrementing the `joint_conf` generation and having the new
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
storage under one more CAS.
8) Call `PUT` `configuration` on safekeepers from the new set,
delivering them `new_conf`. It is enough to deliver it to the majority
of the new set; the rest can be updated by compute.
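For illustration, a hedged sketch of the CAS used in steps 3 and 7, assuming the
configuration storage is a Postgres table shaped like the `timelines` table from
the Schema section below; the function and exact column types are assumptions,
not existing code:

```
// Sketch only: the UPDATE applies iff the stored generation still equals the
// one read in step 1; zero affected rows means another change raced us and
// the procedure aborts.
struct Configuration {
    generation: i32,
    sk_set: Vec<i32>,
    new_sk_set: Option<Vec<i32>>,
}

async fn cas_configuration(
    client: &tokio_postgres::Client,
    tenant_id: &str,
    timeline_id: &str,
    expected_generation: i32,
    new_conf: &Configuration,
) -> Result<bool, tokio_postgres::Error> {
    let updated = client
        .execute(
            "UPDATE timelines \
             SET generation = $1, sk_set = $2, new_sk_set = $3 \
             WHERE tenant_id = $4 AND timeline_id = $5 AND generation = $6",
            &[
                &new_conf.generation,
                &new_conf.sk_set,
                &new_conf.new_sk_set,
                &tenant_id,
                &timeline_id,
                &expected_generation,
            ],
        )
        .await?;
    Ok(updated == 1)
}
```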
I haven't put huge effort into making the description above very precise, because
natural language is prone to interpretation anyway. Instead I'd like to write a
TLA+ spec of it.
The description above focuses on safety. To make the flow practical and live,
here are a few more considerations.
1) It makes sense to ping the new set before step 3 to ensure we are migrating to
live node(s).
2) If e.g. an accidentally wrong new sk set has been specified, it is safe, before the CAS
in step `6` is completed, to roll back to the old conf with one more CAS.
3) On step 4 the timeline might already be created on members of the new set for various
reasons; the simplest is a procedure restart. There are more complicated scenarios like
the one mentioned in step 5. Deleting and re-doing `pull_timeline` is generally unsafe
without involving generations, so it seems simpler to treat an existing timeline as
success. However, this also has a disadvantage: you might imagine a surpassingly unlikely
schedule where the condition in step 5 is never reached until the compute is (re)awakened
to synchronize the new member(s). I don't think we'll observe this in practice, but we
can add waking up the compute if needed.
4) In the end the timeline should be locally deleted on the safekeeper(s) which are
in the old set but not in the new one, unless they are unreachable. To be
safe this should also be done under a generation number (deletion proceeds only if
the current configuration is <= the one in the request and the safekeeper is not a
member of it); see the sketch after this list.
5) If the current conf fetched on step 1 is already not joint and its members are equal to
`desired_set`, jump to step 7, using it as `new_conf`.
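A minimal sketch of the generation guard from consideration 4 (names are
illustrative; checking membership against the configuration carried by the
deletion request is an assumption about where the check would live):

```
// Sketch only: honour a local deletion request issued under `request` iff our
// configuration is not newer and we are not a member of that configuration.
type Generation = u64;
type NodeId = u64;

struct Configuration {
    generation: Generation,
    sk_set: Vec<NodeId>,
    new_sk_set: Option<Vec<NodeId>>,
}

fn deletion_allowed(my_id: NodeId, current: &Configuration, request: &Configuration) -> bool {
    let member_of_request = request.sk_set.contains(&my_id)
        || request.new_sk_set.as_ref().is_some_and(|s| s.contains(&my_id));
    current.generation <= request.generation && !member_of_request
}
```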
## Implementation
The procedure ought to be driven from somewhere. Obvious candidates are the control
plane and storage_controller; and as each of them already has a db, we don't want
yet another storage. I propose to manage safekeepers in storage_controller
because 1) it is in Rust, which simplifies simulation testing (more on this
below) and 2) it already manages pageservers.
This assumes that migration will be fully usable only after we migrate all
tenants/timelines to storage_controller. It is debatable whether we also want
to manage pageserver attachments for all of these, but likely we do.
This requires us to define storcon <-> cplane interface.
### storage_controller <-> control plane interface
First of all, the control plane should
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
to storing safekeepers per timeline instead of per tenant, because we can't migrate
tenants atomically.
The important question is how the updated configuration is delivered from
storage_controller to the control plane to provide it to computes. As always, there
are two options, pull and push. Let's do the same push as with pageserver
`/notify-attach` because 1) it keeps storage_controller out of the critical compute
start path 2) it provides an easier upgrade: there won't be such a thing as 'timeline
managed by control plane / storcon', cplane just takes the value out of its db
when needed 3) uniformity. This makes storage_controller responsible for retrying the
notification of the control plane until it succeeds.
So, cplane `/notify-safekeepers` for the timeline accepts a `Configuration` and
updates it in the db if the provided conf generation is higher (the cplane db
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
should update the db, which makes the call successful, and then try to schedule
`apply_config` if possible; it is ok if not. storage_controller
should rate limit calling the endpoint, but likely this won't be needed, as migration
throughput is limited by `pull_timeline`.
Timeline (branch) creation in cplane should call storage_controller POST
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
The response should be augmented with `safekeeper_conf: Configuration`. The call
should be retried until it succeeds.
Timeline deletion and tenant deletion in cplane should call appropriate
storage_controller endpoints like it currently does for sharded tenants. The
calls should be retried until they succeed.
### storage_controller implementation
The current easy design of 'load everything on startup and keep in memory' is fine.
A single timeline shouldn't take more than 100 bytes (a 16-byte tenant_id, a 16-byte
timeline_id, an int generation, a vec of ~3 safekeeper ids plus some flags), so
10^6 timelines shouldn't take more than 100MB.
Similar to pageserver attachment Intents, storage_controller would have an in-memory
`MigrationRequest` (or its absence) for each timeline and a pool of tasks trying
to make these requests reality; this ensures one instance of storage_controller
won't do several migrations on the same timeline concurrently. In the first
version it is simpler to have more manual control and no retries, i.e. a migration
failure removes the request. Later we can build retries and automatic
scheduling/migration. `MigrationRequest` is
```
enum MigrationRequest {
To(Vec<NodeId>),
FinishPending,
}
```
`FinishPending` requests to run the procedure to ensure the state is clean: the
current configuration is not joint and a majority of safekeepers are aware of it,
but it does not attempt to migrate anywhere. If the current configuration fetched
on step 1 is not joint, it jumps to step 7. It should be run at startup for all
timelines (but similarly, in the first version it is ok to trigger it manually).
#### Schema
A `safekeepers` table mirroring the current `nodes` table should be added, except for
the `scheduling_policy` field (`status` seems like a better name for it): at least in
the beginning it is enough to have only 3 values: 1) `active` 2) `offline` 3)
`decommissioned`.
`timelines` table:
```
table! {
// (tenant_id, timeline_id) is the primary key
timelines (tenant_id, timeline_id) {
timeline_id -> Varchar,
tenant_id -> Varchar,
generation -> Int4,
sk_set -> Array<Int4>, // list of safekeeper ids
new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
cplane_notified_generation -> Int4,
}
}
```
#### API
Node management is similar to pageserver:
1) POST `/control/v1/safekeepers` upserts safekeeper.
2) GET `/control/v1/safekeepers` lists safekeepers.
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g.
`offline` or `decommissioned`. Initially it is simpler not to schedule any
migrations here.
Safekeeper deploy scripts should register the safekeeper at storage_controller as
they currently do with cplane, under the same id.
Timeline creation/deletion: the already existing POST `tenant/:tenant_id/timeline`
would 1) choose the initial set of safekeepers; 2) write the initial
`Configuration` to the db with `INSERT ON CONFLICT DO NOTHING`, returning the
existing row in case of conflict; 3) create the timeline on the majority of
safekeepers (already created is ok).
We don't want to block timeline creation when one safekeeper is down. Currently
this is solved by compute implicitly creating timeline on any safekeeper it is
connected to. This creates an ugly state on the safekeeper where the timeline is
created but its start LSN is not defined yet. It would be nice to remove this; to
do that, the controller can retry in the background to create the timeline on the
safekeeper(s) which missed it during the initial creation call. It can do that
through `pull_timeline` from the majority, so it doesn't need to remember
`parent_lsn` in its db.
Timeline deletion removes the row from the db and forwards deletion to the
current configuration members. Without additional actions deletions might leak,
see below on this; initially let's ignore these, reporting to cplane success if
at least one safekeeper deleted the timeline (this will remove s3 data).
Tenant deletion repeats timeline deletion for all timelines.
Migration API: the first version is the simplest and the most imperative:
1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move
all timelines from one safekeeper to another. It accepts json
```
{
"src_sk": u32,
"dst_sk": u32,
"limit": Optional<u32>,
}
```
Returns list of scheduled requests.
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
to move single timeline to given set of safekeepers:
```
{
"desired_set": Vec<u32>,
}
```
Returns scheduled request.
Similar call should be added for the tenant.
It would be great to have some way of subscribing to the results (apart from
looking at logs/metrics).
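As an illustration, the request bodies above could be modeled as serde structs in storage_controller; the struct names below are assumptions, only the field names follow the JSON shapes shown:
```
use serde::{Deserialize, Serialize};

/// Body of PUT /control/v1/safekeepers/migrate (hypothetical struct name).
#[derive(Serialize, Deserialize)]
pub struct SafekeeperMigrateRequest {
    pub src_sk: u32,
    pub dst_sk: u32,
    pub limit: Option<u32>,
}

/// Body of PUT .../tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate.
#[derive(Serialize, Deserialize)]
pub struct TimelineSafekeeperMigrateRequest {
    pub desired_set: Vec<u32>,
}
```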
Migration is executed as described above. One subtlety is that (local) deletion on
source safekeeper might fail, which is not a problem if we are going to
decommission the node but leaves garbage otherwise. I'd propose in the first version:
1) Don't attempt deletion at all if node status is `offline`.
2) If it failed, just issue a warning.
And add a PUT `/control/v1/safekeepers/:node_id/scrub` endpoint, for manual use, which would find and
remove garbage timelines. It will 1) list all timelines on the
safekeeper, 2) compare each one against configuration storage: if the timeline
doesn't exist at all (it had been deleted), it can be deleted; otherwise, it can
be deleted under the generation number if the node is not a member of the current generation.
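A sketch of that per-timeline scrub decision, with hypothetical types standing in for the configuration storage lookup (not an existing API):
```
/// What the scrub endpoint would do with one timeline found on the safekeeper.
enum ScrubAction {
    Delete,                     // timeline no longer exists in configuration storage
    DeleteUnderGeneration(u32), // node is not a member of the current generation
    Keep,
}

fn scrub_decision(exists_in_configs: bool, current_generation: u32, node_is_member: bool) -> ScrubAction {
    if !exists_in_configs {
        ScrubAction::Delete
    } else if !node_is_member {
        ScrubAction::DeleteUnderGeneration(current_generation)
    } else {
        ScrubAction::Keep
    }
}
```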
Automating this is nontrivial; we'd need to register all potentially missing
deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
which switches configurations. Similarly, when a timeline is fully deleted, the
deletion should also be registered, to prevent the cplane operation from blocking
when some safekeeper is not available.
One more task pool should infinitely retry notifying control plane about changed
safekeeper sets.
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
current in memory state of the timeline and pending `MigrationRequest`,
if any.
4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the
migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
(incrementing generation as always).
#### Dealing with multiple instances of storage_controller
The operations described above, when executed concurrently, might create some errors but do
not prevent progress, so while we normally don't want to run multiple instances
of storage_controller, it is fine to have them temporarily, e.g. during a redeploy.
Any interaction with the db updates the in-memory controller state, e.g. if a migration
request failed because a different one is in progress, the controller remembers that
and tries to finish it.
## Testing
`neon_local` should be switched to use storage_controller, playing the role of the
control plane.
There should be following layers of tests:
1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety.
2) To cover real code and at the same time test many schedules we should have
simulation tests. For that, configuration storage, storage_controller <->
safekeeper communication and pull_timeline need to be mocked and main switch
procedure wrapped as a node (thread) in the simulation tests, using these
mocks. The test would inject migrations like it currently injects
safekeeper/walproposer restarts. The main assert is the same -- committed WAL must
not be lost.
3) Since simulation testing injects at relatively high level points (not
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
better to have basic tests covering the whole system as well. An extended version of
`test_restarts_under_load` would do: start background load and do a migration
under it, then restart the endpoint and check that no reported commits
have been lost. I'd also add one more test creating a classic network split scenario, with
one compute talking to AC and another to BD while migration from nodes ABC to ABD
happens.
4) Simple e2e test should ensure that full flow including cplane notification works.
## Order of implementation and rollout
Note that
- The control plane parts and the integration with them are fully independent from everything else
(tests would use simulation and neon_local).
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
and its impl/rollout should be separate from migration itself.
- Initially walproposer can just stop working while it observes a joint configuration.
Such a window would typically be very short anyway.
To roll out smoothly, both walproposer and safekeeper should have a flag
`configurations_enabled`; when set to false, they would work as they do currently, i.e.
walproposer is able to commit on whatever safekeeper set it is provided. Until
all timelines are managed by storcon we'd need to use the current script to migrate
and update/drop entries in the storage_controller database if it has any.
Safekeepers would need to be able to talk both the current and the new protocol version
with compute to reduce the number of computes restarted in prod once the v2 protocol is
deployed (though before completely switching we'd need to force this).
Let's have the following rollout order:
- storage_controller becomes aware of safekeepers;
- storage_controller gets timeline creation for new timelines and deletion requests, but
doesn't manage all timelines yet. Migration can be tested on these new timelines.
To keep the control plane and storage_controller databases in sync while the control
plane still chooses the safekeepers initially (until all timelines are imported,
it can make a better choice), `TimelineCreateRequest` can get an optional `safekeepers`
field with the safekeepers chosen by cplane.
- Then we can import all existing timelines from control plane to
storage_controller and gradually enable configurations region by region.
Very rough implementation order:
- Add concept of configurations to safekeepers (including control file),
implement v3 protocol.
- Implement walproposer changes, including protocol.
- Implement the storcon part. Use it in neon_local (and pytest).
- Make cplane store safekeepers per timeline instead of per tenant.
- Implement cplane/storcon integration. Route branch creation/deletion
through storcon. Then we can test migration of new branches.
- Finally import existing branches. Then we can drop cplane
safekeeper selection code. Gradually enable configurations at
computes and safekeepers. Before that, all computes must talk only
v3 protocol version.
## Integration with evicted timelines
Currently, `pull_timeline` doesn't work correctly with evicted timelines because
the copy would point to the original partial file. To fix this, let's just do an s3 copy of the
file. It is a bit stupid as it is generally unnecessary work, but it makes sense to
implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542)
## Possible optimizations
The steps above imply a walproposer restart (with re-election) and thus reconnection
to safekeepers. Since by bumping the term on the new majority we ensure that leader
terms are unique even across generation switches, it is possible to preserve
connections. However, it is more complicated; reconnection is very fast, and it
is much more important to avoid a compute restart than a millisecond-order write
stall.
Multiple joint consensus: the algorithm above rejects an attempt to change membership
while another attempt is in progress. It is possible to overlay them, and AFAIK
Aurora does this, but similarly I don't think this is needed.
## Misc
We should use the Compute <-> safekeeper protocol change to include other (long
yearned for) modifications:
- send data in network order to make ARM work.
- remove term_start_lsn from AppendRequest
- add horizon to TermHistory
- add to ProposerGreeting the number of the connection from this wp to sk

View File

@@ -1,265 +0,0 @@
# Physical Replication
This RFC is a bit special in that we have already implemented physical
replication a long time ago. However, we never properly wrote down all
the decisions and assumptions, and in the last months when more users
have started to use the feature, numerous issues have surfaced.
This RFC documents the design decisions that have been made.
## Summary
PostgreSQL has a feature called streaming replication, where a replica
streams WAL from the primary and continuously applies it. It is also
known as "physical replication", to distinguish it from logical
replication. In PostgreSQL, a replica is initialized by taking a
physical backup of the primary. In Neon, the replica is initialized
from a slim "base backup" from the pageserver, just like a primary,
and the primary and the replicas connect to the same pageserver,
sharing the storage.
There are two kinds of read-only replicas in Neon:
- replicas that follow the primary, and
- "static" replicas that are pinned at a particular LSN.
A static replica is useful e.g. for performing time-travel queries and
running one-off slow queries without affecting the primary. A replica
that follows the primary can be used e.g. to scale out read-only
workloads.
## Motivation
Read-only replicas allow offloading read-only queries. It's useful for
isolation, if you want to make sure that read-only queries don't
affect the primary, and it's also an easy way to provide guaranteed
read-only access to an application, without having to mess with access
controls.
## Non Goals (if relevant)
This RFC is all about WAL-based *physical* replication. Logical
replication is a different feature.
Neon also has the capability to launch "static" read-only nodes which
do not follow the primary, but are pinned to a particular LSN. They
can be used for long-running one-off queries, or for Point-in-time
queries. They work similarly to read replicas that follow the primary,
but some things are simpler: there are no concerns about cache
invalidation when the data changes on the primary, or worrying about
transactions that are in-progress on the primary.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
- Control plane launches the replica
- Replica Postgres instance connects to the safekeepers, to stream the WAL
- The primary does not know about the standby, except for the hot standby feedback
- The primary and replicas all connect to the same pageservers
# Context
Some useful things to know about hot standby and replicas in
PostgreSQL.
## PostgreSQL startup sequence
"Running" and "start up" terms are little imprecise. PostgreSQL
replica startup goes through several stages:
1. First, the process is started up, and various initialization steps
   are performed, like initializing shared memory. If you try to
   connect to the server in this stage, you get an error: ERROR: the
   database system is starting up. This stage happens very quickly.
2. Then the server reads the checkpoint record from the WAL and starts
the WAL replay starting from the checkpoint. This works differently
in Neon: we start the WAL replay at the basebackup LSN, not from a
checkpoint! If you connect to the server in this state, you get an
error: ERROR: the database system is not yet accepting
connections. We proceed to the next stage, when the WAL replay sees
a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
can allow us to move directly to the next stage, with all the caveats
listed in this RFC.
3. When the running-xacts information is established, the server
starts to accept connections normally.
From PostgreSQL's point of view, the server is already running in
stage 2, even though it's not accepting connections yet. Our
`compute_ctl` does not consider it as running until stage 3. If the
transition from stage 2 to 3 doesn't happen fast enough, the control
plane will mark the start operation as failed.
## Decisions, Issues
### Cache invalidation in replica
When a read replica follows the primary in PostgreSQL, it needs to
stream all the WAL from the primary and apply all the records, to keep
the local copy of the data consistent with the primary. In Neon, the
replica can fetch the updated page versions from the pageserver, so
it's not necessary to apply all the WAL. However, it needs to ensure
that any pages that are currently in the Postgres buffer cache, or the
Local File Cache, are either updated, or thrown away so that the next
read of the page will fetch the latest version.
We choose to apply the WAL records for pages that are already in the
buffer cache, and skip records for other pages. Somewhat arbitrarily,
we also apply records affecting catalog relations, fetching the old
page version from the pageserver first if necessary. See the
`neon_redo_read_buffer_filter()` function.
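A Rust-flavored sketch of that decision (the real logic is the C function `neon_redo_read_buffer_filter()`; the parameter names here are illustrative only):
```
/// Whether a replica should apply a given WAL record to a given page.
fn should_apply_wal_record(page_is_cached: bool, touches_catalog_relation: bool) -> bool {
    // Cached pages must be kept up to date; catalog pages are applied
    // unconditionally (fetching the old version from the pageserver first if
    // needed). Everything else is skipped: the next read will fetch the
    // latest version from the pageserver anyway.
    page_is_cached || touches_catalog_relation
}
```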
The replica wouldn't necessarily need to see all the WAL records, only
the records that apply to cached pages. For simplicity, we do stream
all the WAL to the replica, and the replica simply ignores WAL records
that require no action.
Like in PostgreSQL, the read replica maintains a "replay LSN", which
is the LSN up to which the replica has received and replayed the
WAL. The replica can lag behind the primary, if it cannot quite keep
up with the primary, or if a long-running query conflicts with changes
that are about to be applied, or even intentionally if the user wishes
to see delayed data (see recovery_min_apply_delay). It's important
that the replica sees a consistent view of the whole cluster at the
replay LSN, when it's lagging behind.
In Neon, the replica connects to a safekeeper to get the WAL
stream. That means that the safekeepers must be able to regurgitate
the original WAL as far back as the replay LSN of any running read
replica. (A static read-only node that does not follow the primary
does not require a WAL stream however). The primary does not need to
be running, and when it is, the replicas don't incur any extra
overhead to the primary (see hot standby feedback though).
### In-progress transactions
In PostgreSQL, when a hot standby server starts up, it cannot
immediately open up for queries (see [PostgreSQL startup
sequence]). It first needs to establish a complete list of in-progress
transactions, including subtransactions, that are running at the
primary, at the current replay LSN. Normally that happens quickly,
when the replica sees a "running-xacts" WAL record, because the
primary writes a running-xacts WAL record at every checkpoint, and in
PostgreSQL the replica always starts the WAL replay from a checkpoint
REDO point. (A shutdown checkpoint WAL record also implies that all
the non-prepared transactions have ended.) If there are a lot of
subtransactions in progress, however, the standby might need to wait
for old transactions to complete before it can open up for queries.
In Neon that problem is worse: a replica can start at any LSN, so
there's no guarantee that it will see a running-xacts record any time
soon. In particular, if the primary is not running when the replica is
started, it might never see a running-xacts record.
To make things worse, we initially missed this issue, and always
started accepting queries at replica startup, even if it didn't have
the transaction information. That could lead to incorrect query
results and data corruption later. However, as we fixed that, we
introduced a new problem compared to what we had before: previously
the replica would always start up, but after fixing that bug, it might
not. In a superficial way, the old behavior was better (but could lead
to serious issues later!). That made fixing that bug very hard,
because as we fixed it, we made things (superficially) worse for
others.
See https://github.com/neondatabase/neon/pull/7288 which fixed the
bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
and https://github.com/neondatabase/neon/pull/8484 to try to claw back
the cases that started to cause trouble after the fix. As of this
writing, there are still cases where a replica might not immediately
start up, causing the control plane operation to fail, the remaining
issues are tracked in https://github.com/neondatabase/neon/issues/6211.
One long-term fix for this is to switch to using so-called CSN
snapshots in the read replica. That would make it unnecessary to have the
full in-progress transaction list in the replica at startup time. See
https://commitfest.postgresql.org/48/4912/ for a work-in-progress
patch to upstream PostgreSQL that implements that.
Another thing we could do is to teach the control plane about that
distinction between "starting up" and "running but haven't received
running-xacts information yet", so that we could keep the replica
waiting longer in that stage, and also give any client connections the
same `ERROR: the database system is not yet accepting connections`
error that you get in standalone PostgreSQL in that state.
### Recovery conflicts and Hot standby feedback
It's possible that a tuple version is vacuumed away in the primary,
even though it is still needed by a running transaction in the
replica. This is called a "recovery conflict", and PostgreSQL provides
various options for dealing with it. By default, the WAL replay will
wait up to 30 s for the conflicting query to finish. After that, it
will kill the running query, so that the WAL replay can proceed.
Another way to avoid the situation is to enable the
[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
option. When it is enabled, the primary will refrain from vacuuming
tuples that are still needed by the replica. That means potentially
bloating the primary, which violates the usual rule that read replicas
don't affect the operations on the primary, which is why it's off by
default. We leave it to users to decide if they want to turn it on,
same as PostgreSQL.
Neon supports `hot_standby_feedback` by passing the feedback messages
from the replica to the safekeepers, and from safekeepers to the
primary.
### Relationship of settings between primary and replica
In order to enter hot standby mode, some configuration options need to
be set to the same or larger values in the standby, compared to the
primary. See [explanation in the PostgreSQL
docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
In Neon, we have this problem too. To prevent customers from hitting
it, the control plane automatically adjusts the settings of a replica,
so that they match or exceed the primary's settings (see
https://github.com/neondatabase/cloud/issues/14903). However, you
can still hit the issue if the primary is restarted with larger
settings, while the replica is running.
### Interaction with Pageserver GC
The read replica can lag behind the primary. If there are recovery
conflicts or the replica cannot keep up for some reason, the lag can
in principle grow indefinitely. The replica will issue all GetPage
requests to the pageservers at the current replay LSN, and needs to
see the old page versions.
If the retention period in the pageserver is set to be small, it may
have already garbage collected away the old page versions. That will
cause read errors in the compute, and can mean that the replica cannot
make progress with the replication anymore.
There is a mechanism for replica to pass information about its replay
LSN to the pageserver, so that the pageserver refrains from GC'ing
data that is still needed by the standby. It's called
'standby_horizon' in the pageserver code, see
https://github.com/neondatabase/neon/pull/7368. A separate "lease"
mechanism is also in the works, where the replica could hold a lease
on an old LSN, preventing the pageserver from advancing the GC
horizon past that point. The difference is that the standby_horizon
mechanism relies on a feedback message from replica to safekeeper,
while the lease API is exposed directly by the pageserver. A static
read-only node is not connected to safekeepers, so it cannot use the
standby_horizon mechanism.
### Synchronous replication
We haven't put any effort into synchronous replication yet.
PostgreSQL provides multiple levels of synchronicity. In the weaker
levels, a transaction is not acknowledged as committed to the client
in the primary until the WAL has been streamed to a replica or flushed
to disk there. Those modes don't make sense in Neon, because the
safekeepers handle durability.
`synchronous_commit=remote_apply` mode would make sense. In that mode,
the commit is not acknowledged to the client until it has been
replayed in the replica. That ensures that after commit, you can see
the commit in the replica too (aka. read-your-write consistency).

View File

@@ -1,259 +0,0 @@
# Rolling Storage Controller Restarts
## Summary
This RFC describes the issues around the current storage controller restart procedure
and describes an implementation which reduces downtime to a few milliseconds on the happy path.
## Motivation
Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
While the storage controller does not sit on the main data path, it's generally not acceptable
to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
### Current Implementation
The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
a new instance is created.
At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
under unfavourable circumstances: pageservers are heavily loaded or unavailable.
## Prior Art
There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
* Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them.
For fail-over, traffic is routed to one of the standbys (which becomes active).
* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
## Requirements
* Reduce storage controller unavailability during upgrades to milliseconds
* Minimize the interval in which it's possible for more than one storage controller
to issue reconciles.
* Have one uniform implementation for restarts and upgrades
* Fit in with the current Kubernetes deployment scheme
## Non Goals
* Implement our own consensus algorithm from scratch
* Completely eliminate storage controller downtime. Instead, we aim to reduce it to the point where it looks
like a transient error to the control plane
## Impacted Components
* storage controller
* deployment orchestration (i.e. Ansible)
* helm charts
## Terminology
* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
at start-up by querying pageservers
* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
a set of replicas
## Implementation
### High Level Flow
At a very high level the proposed idea is to start a new storage controller instance while
the previous one is still running and cut over to it when it becomes ready. The new instance
should coordinate with the existing one and transition responsibility gracefully. While the controller
has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
were operating at the same time and require operator intervention to remedy.
### Kubernetes Deployment Configuration
On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
### Storage Controller Start-Up
This section describes the primitives required on the storage controller side and the flow of the happy path.
#### Database Table For Leader Synchronization
A new table should be added to the storage controller database for leader synchronization during startup.
This table will always contain at most one row. The proposed name for the table is `leader` and the schema
contains two elements:
* `hostname`: represents the hostname for the current storage controller leader - should be addressable
from other pods in the deployment
* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
our needs here.
```
START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
    WHERE hostname=<old_hostname> AND start_timestamp=<old_start_ts>;
COMMIT;
```
If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
#### Step Down API
A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
snapshot of the observed state.
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
for failure scenario handling - see [Handling Failures](#handling-failures)).
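A minimal sketch of the in-memory gate this implies; the type and method names are assumptions, not existing storage controller code:
```
use std::sync::atomic::{AtomicBool, Ordering};

/// Set once the /step_down handler runs; checked by every other handler.
#[derive(Default)]
pub struct LeadershipGate {
    stepped_down: AtomicBool,
}

impl LeadershipGate {
    /// Called by the step_down handler: stop reconciling and stop issuing
    /// location configs from this point on.
    pub fn step_down(&self) {
        self.stepped_down.store(true, Ordering::SeqCst);
    }

    /// If true, other API handlers reply with 503.
    pub fn has_stepped_down(&self) -> bool {
        self.stepped_down.load(Ordering::SeqCst)
    }
}
```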
#### Graceful Restart Happy Path
At start-up, the first thing the storage controller does is retrieve the sole row from the new
`leader` table. If such an entry exists, send a `/step_down` request to the current leader.
This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
pageservers in order to build up the observed state.
Before doing any reconciliations or persistence changes, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization)
section. If this step fails, the storage controller process exits.
Note that no row will exist in the `leader` table for the first graceful restart. In that case, force update the `leader` table
(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build the observed state by querying pageservers).
Summary of proposed new start-up sequence:
1. Call `/step_down`
2. Perform any pending database migrations
3. Load state from database
4. Load observed state returned in step (1) into memory
5. Do initial heartbeat round (may be moved after 5)
6. Mark self as leader by updating the database
7. Reschedule and reconcile everything
Some things to note from the steps above:
* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
calls to the pageserver and no compute notifications)
* Ask the current leader to step down before loading state from database so we don't get a lost update
if the transactions overlap.
* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
fall back to asking the pageservers about their current locations.
* Database migrations should only run **after** the previous instance steps down (or the step down times out).
[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
### Handling Failures
#### Storage Controller Crash Or Restart
The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
exits and consistency is maintained.
#### Previous Leader Crashes Before New Leader Readiness
When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will
reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
(see [2]).
Now we have two cases to consider:
* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
by Kubernetes depending on timings.
* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the new aspiring leader.
[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
should avoid this self reference and fail the API call at the client if the persisted hostname matches
the current one.
#### Previous Leader Crashes After New Leader Readiness
The deployment's replica sets already satisfy the deployment's replica count requirements and the
Kubernetes deployment rollout will just clean up the dead pod.
#### New Leader Crashes Before Passing Readiness Check
The deployment controller scales up the new replica set by creating a new pod. The entire procedure is repeated
with the new pod.
#### Network Partition Between New Pod and Previous Leader
This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles.
### Dealing With Split Brain Scenarios
As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain
duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
The rest of this section sketches some safety measures. It's likely overkill to implement all of them, however.
### Ensure Leadership Before Producing Side Effects
The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases.
### Leadership Lease
Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
to be renewed periodically. Two new columns would be added to the `leader` table:
1. `last_renewed` - timestamp indicating when the lease was last renewed
2. `lease_duration` - duration indicating the amount of time after which the lease expires
The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request.
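For illustration, the expiry check a new instance would perform when `/step_down` does not succeed could look like this (a sketch; no such helper exists today):
```
use std::time::{Duration, SystemTime};

/// Returns true if the current lease has expired and leadership may be taken.
fn lease_expired(last_renewed: SystemTime, lease_duration: Duration, now: SystemTime) -> bool {
    match now.duration_since(last_renewed) {
        Ok(elapsed) => elapsed > lease_duration,
        // `last_renewed` lies in the future (clock skew): be conservative and
        // treat the lease as still valid.
        Err(_) => false,
    }
}
```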
### Notify Pageserver Of Storage Controller Term
Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
anything which contains a stale term (i.e. smaller than the current one).
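A sketch of the pageserver-side check (illustrative types, not the actual pageserver structures):
```
use std::sync::atomic::{AtomicU64, Ordering};

/// Tracks the newest storage controller term seen by this pageserver.
#[derive(Default)]
pub struct ControllerTermGuard {
    latest_term: AtomicU64,
}

impl ControllerTermGuard {
    /// Accept the request only if its term is not older than anything seen so
    /// far, and remember the newest term observed.
    pub fn accept(&self, request_term: u64) -> bool {
        let mut current = self.latest_term.load(Ordering::SeqCst);
        loop {
            if request_term < current {
                return false; // stale controller, refuse the request
            }
            match self.latest_term.compare_exchange(
                current,
                request_term,
                Ordering::SeqCst,
                Ordering::SeqCst,
            ) {
                Ok(_) => return true,
                Err(observed) => current = observed,
            }
        }
    }
}
```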
### Observability
* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`).
Per-region alerts should be added on this metric which trigger when:
+ no storage controller has been in the `Active` state for an extended period of time
+ more than one storage controller is in the `Active` state
* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
## Alternatives
### Kubernetes Leases
Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
In our case, it would work something like this:
* `/step_down` deletes the lease or stops it from renewing
* lease acquisition becomes part of the start-up procedure
The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
not exactly trivial to implement.
This approach has the benefit of baked in observability (`kubectl describe lease`), but:
* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
* More code surface than the simple "row in database" approach. Also, most of this code would be in
a dependency not subject to code review, etc.
* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it to do
so is not simple and complicates the test set-up.
To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
to something external.

View File

@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch-15 REL_15_STABLE_neon
git checkout -b my-branch REL_15_STABLE_neon
```
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
1. Tag the last commit on the stable branch you are updating.
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git merge REL_15_4
git rebase REL_15_4
```
In the commit message of the merge commit, mention if there were
any non-trivial conflicts or other issues.
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
@@ -48,7 +57,7 @@ Postgres in a negative way.
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch-15
git push origin my-branch
```
1. Clone the Neon repository if you have not done so already.
@@ -65,7 +74,7 @@ branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
@@ -80,12 +89,14 @@ minor Postgres release.
1. Create a pull request, and wait for CI to go green.
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
```shell
git push origin my-branch-15:REL_15_STABLE_neon
git push --force origin my-branch:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell

View File

@@ -14,3 +14,5 @@ regex.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }
workspace_hack.workspace = true

View File

@@ -6,8 +6,10 @@ license = "Apache-2.0"
[dependencies]
anyhow.workspace = true
chrono = { workspace = true, features = ["serde"] }
chrono.workspace = true
rand.workspace = true
serde.workspace = true
serde_with.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -14,3 +14,5 @@ parking_lot.workspace = true
hex.workspace = true
scopeguard.workspace = true
smallvec = { workspace = true, features = ["write"] }
workspace_hack.workspace = true

View File

@@ -12,6 +12,8 @@ chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true

View File

@@ -21,9 +21,11 @@ hex.workspace = true
humantime.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono = { workspace = true, features = ["serde"] }
chrono.workspace = true
itertools.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
bincode.workspace = true
rand.workspace = true

View File

@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use crate::models::PageserverUtilization;
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
@@ -56,8 +55,6 @@ pub struct NodeRegisterRequest {
pub listen_http_addr: String,
pub listen_http_port: u16,
pub availability_zone_id: Option<String>,
}
#[derive(Serialize, Deserialize)]
@@ -143,11 +140,23 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
#[derive(Serialize, Clone, Debug)]
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
pub fn worst() -> Self {
UtilizationScore(u64::MAX)
}
}
#[derive(Serialize, Clone, Copy, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
Active(PageserverUtilization),
Active(UtilizationScore),
// Node is warming up, but we expect it to become available soon. Covers
// the time span between the re-attach response being composed on the storage controller
// and the first successful heartbeat after the processing of the re-attach response
@@ -186,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => {
NodeAvailability::Active(PageserverUtilization::full())
}
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
}
@@ -306,17 +313,20 @@ pub struct MetadataHealthUpdateRequest {
pub struct MetadataHealthUpdateResponse {}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
pub unhealthy_tenant_shards: Vec<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
#[serde(with = "humantime_serde")]
pub not_scrubbed_for: Duration,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
pub health_records: Vec<MetadataHealthRecord>,
}

View File

@@ -22,11 +22,6 @@ pub struct Key {
pub field6: u32,
}
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
pub struct CompactKey(i128);
/// The storage key size.
pub const KEY_SIZE: usize = 18;
@@ -108,41 +103,14 @@ impl Key {
}
}
/// This function checks more extensively what keys we can take on the write path.
/// If a key beginning with 00 does not have a global/default tablespace OID, it
/// will be rejected on the write path.
#[allow(dead_code)]
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
if !self.is_i128_representable() {
return false;
}
if self.field1 == 0
&& !(self.field2 == GLOBALTABLESPACE_OID
|| self.field2 == DEFAULTTABLESPACE_OID
|| self.field2 == 0)
{
return false; // User defined tablespaces are not supported
}
true
}
/// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
/// checks if the key is i128 representable. Note that some keys can be successfully
/// ingested into the pageserver, but will cause errors on generating basebackup.
pub fn is_valid_key_on_write_path(&self) -> bool {
self.is_i128_representable()
}
pub fn is_i128_representable(&self) -> bool {
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
}
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.is_i128_representable(), "invalid key: {self}");
assert!(
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
"invalid key: {self}",
);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
@@ -162,14 +130,6 @@ impl Key {
}
}
pub fn to_compact(&self) -> CompactKey {
CompactKey(self.to_i128())
}
pub fn from_compact(k: CompactKey) -> Self {
Self::from_i128(k.0)
}
pub const fn next(&self) -> Key {
self.add(1)
}
@@ -239,13 +199,6 @@ impl fmt::Display for Key {
}
}
impl fmt::Display for CompactKey {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let k = Key::from_compact(*self);
k.fmt(f)
}
}
impl Key {
pub const MIN: Key = Key {
field1: u8::MIN,
@@ -263,15 +216,6 @@ impl Key {
field5: u8::MAX,
field6: u32::MAX,
};
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
pub const NON_L0_MAX: Key = Key {
field1: u8::MAX,
field2: u32::MAX,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX - 1,
};
pub fn from_hex(s: &str) -> Result<Self> {
if s.len() != 36 {

View File

@@ -7,7 +7,7 @@ pub use utilization::PageserverUtilization;
use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V2
Self::V1
}
}
@@ -486,11 +486,12 @@ pub struct EvictionPolicyLayerAccessThreshold {
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
pub task_kinds: Vec<String>, // TaskKind
pub initial: u32,
pub initial: usize,
#[serde(with = "humantime_serde")]
pub refill_interval: Duration,
pub refill_amount: NonZeroU32,
pub max: u32,
pub refill_amount: NonZeroUsize,
pub max: usize,
pub fair: bool,
}
impl ThrottleConfig {
@@ -500,8 +501,9 @@ impl ThrottleConfig {
// other values don't matter with emtpy `task_kinds`.
initial: 0,
refill_interval: Duration::from_millis(1),
refill_amount: NonZeroU32::new(1).unwrap(),
refill_amount: NonZeroUsize::new(1).unwrap(),
max: 1,
fair: true,
}
}
/// The requests per second allowed by the given config.
@@ -719,14 +721,8 @@ pub struct TimelineInfo {
pub walreceiver_status: String,
// ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
// Backward compatibility: you will get a JSON not containing the newly-added field.
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
// read.
/// The last aux file policy being used on this timeline
pub last_aux_file_policy: Option<AuxFilePolicy>,
pub is_archived: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -1066,7 +1062,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
}
}
// A GetPage request contains two LSN values:
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
@@ -1079,7 +1075,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
@@ -1087,11 +1083,15 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V1,
V2,
}
@@ -1230,17 +1230,36 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
// these two fields are the same for every request type
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
let (request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V1 => {
// In the old protocol, each message starts with a boolean 'latest' flag,
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
// 'not_modified_since', used in the new protocol version.
let latest = body.read_u8()? != 0;
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
if latest {
(Lsn::MAX, request_lsn) // get latest version
} else {
(request_lsn, request_lsn) // get version at specified LSN
}
}
};
// The rest of the messages are the same between V1 and V2
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
@@ -1448,7 +1467,9 @@ mod tests {
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
.unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -1,5 +1,4 @@
use std::time::SystemTime;
use utils::{serde_percent::Percent, serde_system_time};
use utils::serde_system_time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
@@ -10,143 +9,19 @@ use utils::{serde_percent::Percent, serde_system_time};
/// not handle full u64 values properly.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct PageserverUtilization {
/// Used disk space (physical, ground truth from statfs())
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub disk_usage_bytes: u64,
/// Free disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub free_space_bytes: u64,
/// Wanted disk space, based on the tenant shards currently present on this pageserver: this
/// is like disk_usage_bytes, but it is stable and does not change with the cache state of
/// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
/// there, or may be unrealistically low if the pageserver has attached tenants which haven't
/// downloaded layers yet.
#[serde(serialize_with = "ser_saturating_u63", default)]
pub disk_wanted_bytes: u64,
// What proportion of total disk space will this pageserver use before it starts evicting data?
#[serde(default = "unity_percent")]
pub disk_usable_pct: Percent,
// How many shards are currently on this node?
#[serde(default)]
pub shard_count: u32,
// How many shards should this node be able to handle at most?
#[serde(default)]
pub max_shard_count: u32,
/// Cached result of [`Self::score`]
pub utilization_score: Option<u64>,
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
#[serde(serialize_with = "ser_saturating_u63")]
pub utilization_score: u64,
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
pub captured_at: serde_system_time::SystemTime,
}
fn unity_percent() -> Percent {
Percent::new(0).unwrap()
}
pub type RawScore = u64;
impl PageserverUtilization {
const UTILIZATION_FULL: u64 = 1000000;
/// Calculate a utilization score. The result is to be inrepreted as a fraction of
/// Self::UTILIZATION_FULL.
///
/// Lower values are more affine to scheduling more work on this node.
/// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
/// - 0.0 represents an empty node.
/// - Negative values are forbidden
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
/// layer eviction.
pub fn score(&self) -> RawScore {
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
* self.disk_usable_pct.get() as u64)
/ 100;
let disk_utilization_score =
self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
let shard_utilization_score =
self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
std::cmp::max(disk_utilization_score, shard_utilization_score)
}
pub fn cached_score(&mut self) -> RawScore {
match self.utilization_score {
None => {
let s = self.score();
self.utilization_score = Some(s);
s
}
Some(s) => s,
}
}
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
pub fn is_overloaded(score: RawScore) -> bool {
score >= Self::UTILIZATION_FULL
}
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
if self.shard_count < shard_count {
self.shard_count = shard_count;
// Dirty cache: this will be calculated next time someone retrives the score
self.utilization_score = None;
}
}
/// A utilization structure that has a full utilization score: use this as a placeholder when
/// you need a utilization but don't have real values yet.
pub fn full() -> Self {
Self {
disk_usage_bytes: 1,
free_space_bytes: 0,
disk_wanted_bytes: 1,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count: 1,
max_shard_count: 1,
utilization_score: Some(Self::UTILIZATION_FULL),
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
}
/// Test helper
pub mod test_utilization {
use super::PageserverUtilization;
use std::time::SystemTime;
use utils::{
serde_percent::Percent,
serde_system_time::{self},
};
// Parameters of the imaginary node used for test utilization instances
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
const TEST_SHARDS_MAX: u32 = 1000;
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
/// not abuse this function from non-test code.
///
/// Emulates a node with a 1000 shard limit and a 1TB disk.
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
PageserverUtilization {
disk_usage_bytes: disk_wanted_bytes,
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
disk_wanted_bytes,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count,
max_shard_count: TEST_SHARDS_MAX,
utilization_score: None,
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
pub captured_at: SystemTime,
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
@@ -174,19 +49,15 @@ mod tests {
let doc = PageserverUtilization {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
disk_wanted_bytes: u64::MAX,
utilization_score: Some(13),
disk_usable_pct: Percent::new(90).unwrap(),
shard_count: 100,
max_shard_count: 200,
captured_at: serde_system_time::SystemTime(
utilization_score: u64::MAX,
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
};
let s = serde_json::to_string(&doc).unwrap();
let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
assert_eq!(s, expected);
}

View File

@@ -18,6 +18,7 @@ tokio-rustls.workspace = true
tracing.workspace = true
pq_proto.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
once_cell.workspace = true

View File

@@ -11,5 +11,7 @@ postgres.workspace = true
tokio-postgres.workspace = true
url.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
once_cell.workspace = true

View File

@@ -19,6 +19,8 @@ thiserror.workspace = true
serde.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true

View File

@@ -136,15 +136,15 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
// Export some version independent functions that are used outside of this mod
pub use v14::xlog_utils::encode_logical_message;
pub use v14::xlog_utils::from_pg_timestamp;
pub use v14::xlog_utils::get_current_timestamp;
pub use v14::xlog_utils::to_pg_timestamp;
pub use v14::xlog_utils::try_from_pg_timestamp;
pub use v14::xlog_utils::XLogFileName;
pub use v14::bindings::DBState_DB_SHUTDOWNED;
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
}
pub fn generate_wal_segment(

View File

@@ -135,8 +135,6 @@ pub fn get_current_timestamp() -> TimestampTz {
mod timestamp_conversions {
use std::time::Duration;
use anyhow::Context;
use super::*;
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
@@ -156,18 +154,18 @@ mod timestamp_conversions {
}
}
pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
let time: u64 = time
.try_into()
.context("timestamp before millenium (postgres epoch)")?;
.expect("timestamp before millenium (postgres epoch)");
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_micros(since_unix_epoch))
.context("SystemTime overflow")
.expect("SystemTime overflow")
}
}
pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
// start_lsn must point to some previously known record boundary (beginning of
@@ -547,14 +545,14 @@ mod tests {
#[test]
fn test_ts_conversion() {
let now = SystemTime::now();
let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
let now_pg = get_current_timestamp();
let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
assert_eq!(now_pg, round_trip_pg);
}
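For reference, a minimal sketch of the epoch shift these helpers perform. The constant values below are my assumption of what the crate defines (Postgres counts microseconds from 2000-01-01); the real code additionally handles the signed TimestampTz and overflow:
use std::time::{Duration, SystemTime};

const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 = 946_684_800; // 2000-01-01 minus 1970-01-01, in seconds (assumed value)
const USECS_PER_SEC: u64 = 1_000_000;

fn pg_usecs_to_system_time(pg_usecs: u64) -> SystemTime {
    // Shift from the Postgres epoch to the Unix epoch, then build a SystemTime.
    let since_unix_epoch = pg_usecs + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
    SystemTime::UNIX_EPOCH + Duration::from_micros(since_unix_epoch)
}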

View File

@@ -14,6 +14,8 @@ postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
regex.workspace = true
utils.workspace = true

View File

@@ -11,7 +11,9 @@ itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tokio.workspace = true
tracing.workspace = true
thiserror.workspace = true
serde.workspace = true
workspace_hack.workspace = true

View File

@@ -32,7 +32,7 @@ scopeguard.workspace = true
metrics.workspace = true
utils.workspace = true
pin-project-lite.workspace = true
workspace_hack.workspace = true
azure_core.workspace = true
azure_identity.workspace = true
azure_storage.workspace = true
@@ -46,4 +46,3 @@ sync_wrapper = { workspace = true, features = ["futures"] }
camino-tempfile.workspace = true
test-context.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["test-util"] }

View File

@@ -383,48 +383,6 @@ impl RemoteStorage for AzureBlobStorage {
}
}
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
let kind = RequestKind::Head;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
let properties_future = blob_client.get_properties().into_future();
let properties_future = tokio::time::timeout(self.timeout, properties_future);
let res = tokio::select! {
res = properties_future => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}
let data = match res {
Ok(Ok(data)) => Ok(data),
Ok(Err(sdk)) => Err(to_download_error(sdk)),
Err(_timeout) => Err(DownloadError::Timeout),
}?;
let properties = data.blob.properties;
Ok(ListingObject {
key: key.to_owned(),
last_modified: SystemTime::from(properties.last_modified),
size: properties.content_length,
})
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,

View File

@@ -42,10 +42,6 @@ impl DownloadError {
Timeout | Other(_) => false,
}
}
pub fn is_cancelled(&self) -> bool {
matches!(self, DownloadError::Cancelled)
}
}
impl From<std::io::Error> for DownloadError {

View File

@@ -150,7 +150,7 @@ pub enum ListingMode {
NoDelimiter,
}
#[derive(PartialEq, Eq, Debug, Clone)]
#[derive(PartialEq, Eq, Debug)]
pub struct ListingObject {
pub key: RemotePath,
pub last_modified: SystemTime,
@@ -215,13 +215,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
Ok(combined)
}
/// Obtain metadata information about an object.
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError>;
/// Streams the local file contents into remote into the remote storage entry.
///
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -370,20 +363,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
// See [`RemoteStorage::head_object`].
pub async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
match self {
Self::LocalFs(s) => s.head_object(key, cancel).await,
Self::AwsS3(s) => s.head_object(key, cancel).await,
Self::AzureBlob(s) => s.head_object(key, cancel).await,
Self::Unreliable(s) => s.head_object(key, cancel).await,
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,
@@ -619,7 +598,6 @@ impl ConcurrencyLimiter {
RequestKind::Delete => &self.write,
RequestKind::Copy => &self.write,
RequestKind::TimeTravel => &self.write,
RequestKind::Head => &self.read,
}
}

View File

@@ -445,20 +445,6 @@ impl RemoteStorage for LocalFs {
}
}
async fn head_object(
&self,
key: &RemotePath,
_cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
let target_file_path = key.with_base(&self.storage_root);
let metadata = file_metadata(&target_file_path).await?;
Ok(ListingObject {
key: key.clone(),
last_modified: metadata.modified()?,
size: metadata.len(),
})
}
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,

View File

@@ -13,7 +13,6 @@ pub(crate) enum RequestKind {
List = 3,
Copy = 4,
TimeTravel = 5,
Head = 6,
}
use scopeguard::ScopeGuard;
@@ -28,7 +27,6 @@ impl RequestKind {
List => "list_objects",
Copy => "copy_object",
TimeTravel => "time_travel_recover",
Head => "head_object",
}
}
const fn as_index(&self) -> usize {
@@ -36,8 +34,7 @@ impl RequestKind {
}
}
const REQUEST_KIND_COUNT: usize = 7;
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
pub(crate) struct RequestTyped<C>([C; 6]);
impl<C> RequestTyped<C> {
pub(crate) fn get(&self, kind: RequestKind) -> &C {
@@ -46,8 +43,8 @@ impl<C> RequestTyped<C> {
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
let arr = std::array::from_fn::<C, 6, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)

View File

@@ -23,7 +23,7 @@ use aws_config::{
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::{get_object::GetObjectError, head_object::HeadObjectError},
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
Client,
};
@@ -604,78 +604,6 @@ impl RemoteStorage for S3Bucket {
}
}
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<ListingObject, DownloadError> {
let kind = RequestKind::Head;
let _permit = self.permit(kind, cancel).await?;
let started_at = start_measuring_requests(kind);
let head_future = self
.client
.head_object()
.bucket(self.bucket_name())
.key(self.relative_path_to_s3_object(key))
.send();
let head_future = tokio::time::timeout(self.timeout, head_future);
let res = tokio::select! {
res = head_future => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};
let res = res.map_err(|_e| DownloadError::Timeout)?;
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
let data = match res {
Ok(object_output) => object_output,
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
// an error: we expect to sometimes fetch an object and find it missing,
// e.g. when probing for timeline indices.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
return Err(DownloadError::NotFound);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
return Err(DownloadError::Other(
anyhow::Error::new(e).context("s3 head object"),
));
}
};
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
return Err(DownloadError::Other(anyhow!(
"head_object doesn't contain last_modified or content_length"
)))?;
};
Ok(ListingObject {
key: key.to_owned(),
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
})?,
size: size as u64,
})
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,

View File

@@ -30,7 +30,6 @@ pub struct UnreliableWrapper {
#[derive(Debug, Hash, Eq, PartialEq)]
enum RemoteOp {
ListPrefixes(Option<RemotePath>),
HeadObject(RemotePath),
Upload(RemotePath),
Download(RemotePath),
Delete(RemotePath),
@@ -138,16 +137,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list(prefix, mode, max_keys, cancel).await
}
async fn head_object(
&self,
key: &RemotePath,
cancel: &CancellationToken,
) -> Result<crate::ListingObject, DownloadError> {
self.attempt(RemoteOp::HeadObject(key.clone()))
.map_err(DownloadError::Other)?;
self.inner.head_object(key, cancel).await
}
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,

View File

@@ -9,3 +9,5 @@ serde.workspace = true
serde_with.workspace = true
const_format.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -9,3 +9,5 @@ license.workspace = true
anyhow.workspace = true
serde.workspace = true
serde_json.workspace = true
workspace_hack.workspace = true

View File

@@ -14,3 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
workspace_hack.workspace = true

View File

@@ -14,6 +14,7 @@ testing = ["fail/failpoints"]
arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
bytes.workspace = true
@@ -25,6 +26,7 @@ hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true
leaky-bucket.workspace = true
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
@@ -37,7 +39,7 @@ thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = ["serde"] }
toml_edit.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -52,6 +54,7 @@ walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true
metrics.workspace = true
workspace_hack.workspace = true
const_format.workspace = true
@@ -68,7 +71,6 @@ criterion.workspace = true
hex-literal.workspace = true
camino-tempfile.workspace = true
serde_assert.workspace = true
tokio = { workspace = true, features = ["test-util"] }
[[bench]]
name = "benchmarks"

View File

@@ -5,40 +5,13 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
token: TaskTrackerToken,
}
impl std::fmt::Debug for Completion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Completion")
.field("siblings", &self.token.task_tracker().len())
.finish()
}
}
impl Completion {
/// Returns true if this completion is associated with the given barrier.
pub fn blocks(&self, barrier: &Barrier) -> bool {
TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0)
}
pub fn barrier(&self) -> Barrier {
Barrier(self.token.task_tracker().clone())
}
_token: TaskTrackerToken,
}
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(TaskTracker);
impl std::fmt::Debug for Barrier {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Barrier")
.field("remaining", &self.0.len())
.finish()
}
}
impl Default for Barrier {
fn default() -> Self {
let (_, rx) = channel();
@@ -78,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { token }, Barrier(tracker))
(Completion { _token: token }, Barrier(tracker))
}
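A minimal sketch of the mechanism channel() builds on (tokio-util's TaskTracker). This is illustration only, assumes a tokio runtime, and is not part of the crate's public API:
use tokio_util::task::TaskTracker;

async fn sketch() {
    let tracker = TaskTracker::new();
    tracker.close();               // nothing else will be spawned onto the tracker itself
    let token = tracker.token();   // plays the role of a Completion guard
    let barrier = tracker.clone(); // plays the role of a Barrier

    tokio::spawn(async move {
        let _guard = token;        // dropping the token marks this work as finished
    });

    barrier.wait().await;          // resolves once every outstanding token has been dropped
}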

View File

@@ -1,280 +0,0 @@
//! This module implements the Generic Cell Rate Algorithm for a simplified
//! version of the Leaky Bucket rate limiting system.
//!
//! # Leaky Bucket
//!
//! If the bucket is full, no new requests are allowed and are throttled/errored.
//! If the bucket is partially full/empty, new requests are added to the bucket in
//! terms of "tokens".
//!
//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
//!
//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
//!
//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
//!
//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
//!
//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
//!
//! Another explanation can be found here: <https://brandur.org/rate-limiting>
use std::{sync::Mutex, time::Duration};
use tokio::{sync::Notify, time::Instant};
pub struct LeakyBucketConfig {
/// This is the "time cost" of a single request unit.
/// Should loosely represent how long it takes to handle a request unit in active resource time.
/// Loosely speaking this is the inverse of the steady-rate requests-per-second
pub cost: Duration,
/// total size of the bucket
pub bucket_width: Duration,
}
impl LeakyBucketConfig {
pub fn new(rps: f64, bucket_size: f64) -> Self {
let cost = Duration::from_secs_f64(rps.recip());
let bucket_width = cost.mul_f64(bucket_size);
Self { cost, bucket_width }
}
}
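A quick sanity check of the constructor above: cost is the inverse of the steady rate and bucket_width scales it by the burst size, so the numbers below mirror the test at the bottom of this file (sketch only):
fn config_sketch() {
    // 100 requests per second with room for a 100-request burst.
    let config = LeakyBucketConfig::new(100.0, 100.0);
    // cost ~= 10ms per request, bucket_width ~= 1s.
    println!("cost={:?} width={:?}", config.cost, config.bucket_width);
}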
pub struct LeakyBucketState {
/// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
///
/// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
/// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
/// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
/// Draining the bucket will happen naturally as `now` moves forward.
///
/// Let `n` be some "time cost" for the request,
/// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
/// If now is within the `bucket window + n`, we are within time budget.
/// If now is before the `bucket window + n`, we have run out of budget.
///
/// This is inspired by the generic cell rate algorithm (GCRA) and works
/// exactly the same as a leaky-bucket.
pub empty_at: Instant,
}
impl LeakyBucketState {
pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
LeakyBucketState {
empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
}
}
pub fn bucket_is_empty(&self, now: Instant) -> bool {
// if self.end is after now, the bucket is not empty
self.empty_at <= now
}
/// Immediately adds tokens to the bucket, if there is space.
///
/// In a scenario where you are waiting for available rate,
/// rather than just erroring immediately, `started` corresponds to when this waiting started.
///
/// `n` is the number of tokens that will be filled in the bucket.
///
/// # Errors
///
/// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
/// there will be space again.
pub fn add_tokens(
&mut self,
config: &LeakyBucketConfig,
started: Instant,
n: f64,
) -> Result<(), Instant> {
let now = Instant::now();
// invariant: started <= now
debug_assert!(started <= now);
// If the bucket was empty when we started our search,
// we should update the `empty_at` value accordingly.
// this prevents us from having negative tokens in the bucket.
let mut empty_at = self.empty_at;
if empty_at < started {
empty_at = started;
}
let n = config.cost.mul_f64(n);
let new_empty_at = empty_at + n;
let allow_at = new_empty_at.checked_sub(config.bucket_width);
// empty_at
// allow_at | new_empty_at
// / | /
// -------o-[---------o-|--]---------
// now1 ^ now2 ^
//
// at now1, the bucket would be completely filled if we add n tokens.
// at now2, the bucket would be partially filled if we add n tokens.
match allow_at {
Some(allow_at) if now < allow_at => Err(allow_at),
_ => {
self.empty_at = new_empty_at;
Ok(())
}
}
}
}
pub struct RateLimiter {
pub config: LeakyBucketConfig,
pub state: Mutex<LeakyBucketState>,
    /// A queue used to provide fair ordering of waiters.
pub queue: Notify,
}
struct Requeue<'a>(&'a Notify);
impl Drop for Requeue<'_> {
fn drop(&mut self) {
self.0.notify_one();
}
}
impl RateLimiter {
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
RateLimiter {
state: Mutex::new(LeakyBucketState::with_initial_tokens(
&config,
initial_tokens,
)),
config,
queue: {
let queue = Notify::new();
queue.notify_one();
queue
},
}
}
pub fn steady_rps(&self) -> f64 {
self.config.cost.as_secs_f64().recip()
}
/// returns true if we did throttle
pub async fn acquire(&self, count: usize) -> bool {
let mut throttled = false;
let start = tokio::time::Instant::now();
// wait until we are the first in the queue
let mut notified = std::pin::pin!(self.queue.notified());
if !notified.as_mut().enable() {
throttled = true;
notified.await;
}
// notify the next waiter in the queue when we are done.
let _guard = Requeue(&self.queue);
loop {
let res = self
.state
.lock()
.unwrap()
.add_tokens(&self.config, start, count as f64);
match res {
Ok(()) => return throttled,
Err(ready_at) => {
throttled = true;
tokio::time::sleep_until(ready_at).await;
}
}
}
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use tokio::time::Instant;
use super::{LeakyBucketConfig, LeakyBucketState};
#[tokio::test(start_paused = true)]
async fn check() {
let config = LeakyBucketConfig {
// average 100rps
cost: Duration::from_millis(10),
// burst up to 100 requests
bucket_width: Duration::from_millis(1000),
};
let mut state = LeakyBucketState {
empty_at: Instant::now(),
};
// supports burst
{
// should work for 100 requests this instant
for _ in 0..100 {
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
}
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
}
// doesn't overfill
{
// after 1s we should have an empty bucket again.
tokio::time::advance(Duration::from_secs(1)).await;
assert!(state.bucket_is_empty(Instant::now()));
// after 1s more, we should not over count the tokens and allow more than 200 requests.
tokio::time::advance(Duration::from_secs(1)).await;
for _ in 0..100 {
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
}
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
}
// supports sustained rate over a long period
{
tokio::time::advance(Duration::from_secs(1)).await;
// should sustain 100rps
for _ in 0..2000 {
tokio::time::advance(Duration::from_millis(10)).await;
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
}
}
// supports requesting more tokens than can be stored in the bucket
// we just wait a little bit longer upfront.
{
// start the bucket completely empty
tokio::time::advance(Duration::from_secs(5)).await;
assert!(state.bucket_is_empty(Instant::now()));
// requesting 200 tokens of space should take 200*cost = 2s
// but we already have 1s available, so we wait 1s from start.
let start = Instant::now();
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_secs(1));
tokio::time::advance(Duration::from_millis(500)).await;
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(500));
tokio::time::advance(Duration::from_millis(500)).await;
state.add_tokens(&config, start, 200.0).unwrap();
// bucket should be completely full now
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
}
}
}

View File

@@ -71,7 +71,6 @@ pub mod postgres_client;
pub mod tracing_span_assert;
pub mod leaky_bucket;
pub mod rate_limit;
/// Simple once-barrier and a guard which keeps barrier awaiting.

View File

@@ -5,15 +5,6 @@ use std::time::{Duration, Instant};
pub struct RateLimit {
last: Option<Instant>,
interval: Duration,
dropped: u64,
}
pub struct RateLimitStats(u64);
impl std::fmt::Display for RateLimitStats {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{} dropped calls", self.0)
}
}
impl RateLimit {
@@ -21,27 +12,20 @@ impl RateLimit {
Self {
last: None,
interval,
dropped: 0,
}
}
/// Call `f` if the rate limit allows.
/// Don't call it otherwise.
pub fn call<F: FnOnce()>(&mut self, f: F) {
self.call2(|_| f())
}
pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
let now = Instant::now();
match self.last {
Some(last) if now - last <= self.interval => {
// ratelimit
self.dropped += 1;
}
_ => {
self.last = Some(now);
f(RateLimitStats(self.dropped));
self.dropped = 0;
f();
}
}
}
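A hedged usage sketch of the type above; the module path, the RateLimit::new constructor, and the 10-second interval are assumptions inferred from the surrounding hunks:
use std::time::Duration;
use utils::rate_limit::RateLimit;

fn noisy_loop() {
    let mut rate_limit = RateLimit::new(Duration::from_secs(10));
    for _ in 0..1_000 {
        // Only the first invocation in each 10-second window runs the closure;
        // the rest are skipped by the rate limit.
        rate_limit.call(|| eprintln!("falling behind"));
    }
}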

View File

@@ -9,6 +9,8 @@ anyhow.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
workspace_hack.workspace = true
[build-dependencies]
anyhow.workspace = true
bindgen.workspace = true

View File

@@ -95,7 +95,6 @@ fn main() -> anyhow::Result<()> {
.allowlist_var("ERROR")
.allowlist_var("FATAL")
.allowlist_var("PANIC")
.allowlist_var("PG_VERSION_NUM")
.allowlist_var("WPEVENT")
.allowlist_var("WL_LATCH_SET")
.allowlist_var("WL_SOCKET_READABLE")

View File

@@ -282,11 +282,7 @@ mod tests {
use std::cell::UnsafeCell;
use utils::id::TenantTimelineId;
use crate::{
api_bindings::Level,
bindings::{NeonWALReadResult, PG_VERSION_NUM},
walproposer::Wrapper,
};
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
use super::ApiImpl;
@@ -493,79 +489,41 @@ mod tests {
let (sender, receiver) = sync_channel(1);
        // Message definitions are in walproposer.h
// xxx: it would be better to extract them from safekeeper crate and
// use serialization/deserialization here.
let greeting_tag = (b'g' as u64).to_ne_bytes();
let proto_version = 2_u32.to_ne_bytes();
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
let proposer_id = [0; 16];
let system_id = 0_u64.to_ne_bytes();
let tenant_id = ttid.tenant_id.as_arr();
let timeline_id = ttid.timeline_id.as_arr();
let pg_tli = 1_u32.to_ne_bytes();
let wal_seg_size = 16777216_u32.to_ne_bytes();
let proposer_greeting = [
greeting_tag.as_slice(),
proto_version.as_slice(),
pg_version.as_slice(),
proposer_id.as_slice(),
system_id.as_slice(),
tenant_id.as_slice(),
timeline_id.as_slice(),
pg_tli.as_slice(),
wal_seg_size.as_slice(),
]
.concat();
let voting_tag = (b'v' as u64).to_ne_bytes();
let vote_request_term = 3_u64.to_ne_bytes();
let proposer_id = [0; 16];
let vote_request = [
voting_tag.as_slice(),
vote_request_term.as_slice(),
proposer_id.as_slice(),
]
.concat();
let acceptor_greeting_term = 2_u64.to_ne_bytes();
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
let acceptor_greeting = [
greeting_tag.as_slice(),
acceptor_greeting_term.as_slice(),
acceptor_greeting_node_id.as_slice(),
]
.concat();
let vote_response_term = 3_u64.to_ne_bytes();
let vote_given = 1_u64.to_ne_bytes();
let flush_lsn = 0x539_u64.to_ne_bytes();
let truncate_lsn = 0x539_u64.to_ne_bytes();
let th_len = 1_u32.to_ne_bytes();
let th_term = 2_u64.to_ne_bytes();
let th_lsn = 0x539_u64.to_ne_bytes();
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
let vote_response = [
voting_tag.as_slice(),
vote_response_term.as_slice(),
vote_given.as_slice(),
flush_lsn.as_slice(),
truncate_lsn.as_slice(),
th_len.as_slice(),
th_term.as_slice(),
th_lsn.as_slice(),
timeline_start_lsn.as_slice(),
]
.concat();
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
wait_events: Cell::new(WaitEventsData {
sk: std::ptr::null_mut(),
event_mask: 0,
}),
expected_messages: vec![proposer_greeting, vote_request],
expected_messages: vec![
// TODO: When updating Postgres versions, this test will cause
// problems. Postgres version in message needs updating.
//
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
],
// VoteRequest(VoteRequest { term: 3 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
],
],
expected_ptr: AtomicUsize::new(0),
safekeeper_replies: vec![acceptor_greeting, vote_response],
safekeeper_replies: vec![
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
],
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
],
],
replies_ptr: AtomicUsize::new(0),
sync_channel: sender,
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),

View File

@@ -16,7 +16,6 @@ arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
bit_field.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
@@ -37,6 +36,7 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
leaky-bucket.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
@@ -52,7 +52,6 @@ rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
scopeguard.workspace = true
send-future.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_path_to_error.workspace = true

View File

@@ -4,13 +4,12 @@ use bytes::Bytes;
use camino::Utf8PathBuf;
use criterion::{criterion_group, criterion_main, Criterion};
use pageserver::{
config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
@@ -68,16 +67,12 @@ async fn ingest(
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
let data_ser_size = data.serialized_size().unwrap() as usize;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
const BATCH_SIZE: usize = 16;
let mut batch = Vec::new();
for i in 0..put_count {
lsn += put_size as u64;
@@ -100,17 +95,7 @@ async fn ingest(
}
}
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
if batch.len() >= BATCH_SIZE {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
layer.put_batch(serialized, &ctx).await?;
}
}
if !batch.is_empty() {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
layer.put_batch(serialized, &ctx).await?;
layer.put_value(key, lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;
@@ -164,11 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let conf: &'static PageServerConf = Box::leak(Box::new(
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
));
virtual_file::init(
16384,
virtual_file::io_engine_for_bench(),
DEFAULT_IO_BUFFER_ALIGNMENT,
);
virtual_file::init(16384, virtual_file::io_engine_for_bench());
page_cache::init(conf.page_cache_size);
{

View File

@@ -7,6 +7,7 @@ license.workspace = true
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
utils.workspace = true
serde.workspace = true

View File

@@ -419,24 +419,6 @@ impl Client {
}
}
pub async fn timeline_archival_config(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
req: &TimelineArchivalConfigRequest,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
self.mgmt_api_endpoint
);
self.request(Method::POST, &uri, req)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
@@ -524,16 +506,6 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// Configs io buffer alignment at runtime.
pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
self.request(Method::PUT, uri, align)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
self.get(uri)

View File

@@ -4,7 +4,6 @@
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -145,11 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
DEFAULT_IO_BUFFER_ALIGNMENT,
);
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
pageserver::page_cache::init(100);
let mut total_delta_layers = 0usize;

View File

@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use clap::Subcommand;
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
@@ -60,7 +59,7 @@ pub(crate) enum LayerCmd {
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path, ctx).await?;
let file_id = page_cache::next_file_id();
@@ -90,7 +89,6 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
for (k, v) in all {
let value = cursor.read_blob(v.pos(), ctx).await?;
println!("key:{} value_len:{}", k, value.len());
assert!(k.is_i128_representable(), "invalid key: ");
}
// TODO(chi): special handling for last key?
Ok(())
@@ -191,11 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
DEFAULT_IO_BUFFER_ALIGNMENT,
);
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

View File

@@ -20,7 +20,6 @@ use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
use layers::LayerCmd;
use pageserver::{
config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
@@ -206,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
DEFAULT_IO_BUFFER_ALIGNMENT,
);
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await

View File

@@ -58,11 +58,6 @@ pub(crate) struct Args {
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
#[clap(long)]
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
/// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
#[clap(long)]
set_io_alignment: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
@@ -129,10 +124,6 @@ async fn main_impl(
mgmt_api_client.put_io_engine(engine_str).await?;
}
if let Some(align) = args.set_io_alignment {
mgmt_api_client.put_io_alignment(align).await?;
}
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,

View File

@@ -1,39 +0,0 @@
//! `u64` and `usize` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
}
};
pub(crate) trait U64IsUsize {
fn into_usize(self) -> usize;
}
impl U64IsUsize for u64 {
#[inline(always)]
fn into_usize(self) -> usize {
#[allow(clippy::let_unit_value)]
let _ = _ASSERT_U64_EQ_USIZE;
self as usize
}
}
pub(crate) trait UsizeIsU64 {
fn into_u64(self) -> u64;
}
impl UsizeIsU64 for usize {
#[inline(always)]
fn into_u64(self) -> u64 {
#[allow(clippy::let_unit_value)]
let _ = _ASSERT_U64_EQ_USIZE;
self as u64
}
}
pub const fn u64_to_usize(x: u64) -> usize {
#[allow(clippy::let_unit_value)]
let _ = _ASSERT_U64_EQ_USIZE;
x as usize
}

View File

@@ -124,70 +124,21 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.get_impl, "starting with get page implementation");
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
// The tenants directory contains all the pageserver local disk state.
// Create if not exists and make sure all the contents are durable before proceeding.
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
    // After unclean shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
let tenants_path = conf.tenants_path();
{
let open = || {
nix::dir::Dir::open(
tenants_path.as_std_path(),
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
nix::sys::stat::Mode::empty(),
)
};
let dirfd = match open() {
Ok(dirfd) => dirfd,
Err(e) => match e {
nix::errno::Errno::ENOENT => {
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
format!("Failed to create tenants root dir at '{tenants_path}'")
})?;
open().context("open tenants dir after creating it")?
}
e => anyhow::bail!(e),
},
};
let started = Instant::now();
// Linux guarantees durability for syncfs.
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
#[cfg(target_os = "linux")]
{
use std::os::fd::AsRawFd;
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
}
#[cfg(target_os = "macos")]
{
// macOS is not a production platform for Neon, don't even bother.
drop(dirfd);
}
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
{
compile_error!("Unsupported OS");
}
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path())
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
}
// Initialize up failpoints support
let scenario = failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(
conf.max_file_descriptors,
conf.virtual_file_io_engine,
conf.io_buffer_alignment,
);
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
page_cache::init(conf.page_cache_size);
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

View File

@@ -29,13 +29,12 @@ use utils::{
logging::LogFormat,
};
use crate::l0_flush::L0FlushConfig;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
@@ -51,6 +50,7 @@ pub mod defaults {
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT,
};
use pageserver_api::models::ImageCompressionAlgorithm;
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
@@ -90,14 +90,13 @@ pub mod defaults {
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Disabled;
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
///
/// Default built-in configuration file.
///
@@ -134,8 +133,14 @@ pub mod defaults {
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
#get_impl = '{DEFAULT_GET_IMPL}'
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -273,8 +278,14 @@ pub struct PageServerConf {
pub virtual_file_io_engine: virtual_file::IoEngineKind,
pub get_vectored_impl: GetVectoredImpl,
pub get_impl: GetImpl,
pub max_vectored_read_bytes: MaxVectoredReadBytes,
pub validate_vectored_get: bool,
pub image_compression: ImageCompressionAlgorithm,
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
@@ -292,8 +303,6 @@ pub struct PageServerConf {
/// Direct IO settings
pub virtual_file_direct_io: virtual_file::DirectIoMode,
pub io_buffer_alignment: usize,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -387,8 +396,14 @@ struct PageServerConfigBuilder {
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
get_vectored_impl: BuilderValue<GetVectoredImpl>,
get_impl: BuilderValue<GetImpl>,
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
validate_vectored_get: BuilderValue<bool>,
image_compression: BuilderValue<ImageCompressionAlgorithm>,
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
@@ -398,8 +413,6 @@ struct PageServerConfigBuilder {
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
io_buffer_alignment: BuilderValue<usize>,
}
impl PageServerConfigBuilder {
@@ -480,15 +493,17 @@ impl PageServerConfigBuilder {
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
)),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
}
}
}
@@ -644,10 +659,22 @@ impl PageServerConfigBuilder {
self.virtual_file_io_engine = BuilderValue::Set(value);
}
pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
self.get_vectored_impl = BuilderValue::Set(value);
}
pub fn get_impl(&mut self, value: GetImpl) {
self.get_impl = BuilderValue::Set(value);
}
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
self.max_vectored_read_bytes = BuilderValue::Set(value);
}
pub fn get_validate_vectored_get(&mut self, value: bool) {
self.validate_vectored_get = BuilderValue::Set(value);
}
pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
self.image_compression = BuilderValue::Set(value);
}
@@ -668,10 +695,6 @@ impl PageServerConfigBuilder {
self.virtual_file_direct_io = BuilderValue::Set(value);
}
pub fn io_buffer_alignment(&mut self, value: usize) {
self.io_buffer_alignment = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -722,13 +745,15 @@ impl PageServerConfigBuilder {
heatmap_upload_concurrency,
secondary_download_concurrency,
ingest_batch_size,
get_vectored_impl,
get_impl,
max_vectored_read_bytes,
validate_vectored_get,
image_compression,
ephemeral_bytes_per_memory_kb,
l0_flush,
compact_level0_phase1_value_access,
virtual_file_direct_io,
io_buffer_alignment,
}
CUSTOM LOGIC
{
@@ -977,12 +1002,21 @@ impl PageServerConf {
"virtual_file_io_engine" => {
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
}
"get_vectored_impl" => {
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
}
"get_impl" => {
builder.get_impl(parse_toml_from_str("get_impl", item)?)
}
"max_vectored_read_bytes" => {
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
builder.get_max_vectored_read_bytes(
MaxVectoredReadBytes(
NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
}
"validate_vectored_get" => {
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
}
"image_compression" => {
builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
}
@@ -998,9 +1032,6 @@ impl PageServerConf {
"virtual_file_direct_io" => {
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
}
"io_buffer_alignment" => {
builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1021,15 +1052,6 @@ impl PageServerConf {
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
.map_err(|msg| anyhow::anyhow!("{msg}"))
.with_context(|| {
format!(
"effective checkpoint distance is unsupported: {}",
conf.default_tenant_conf.checkpoint_distance
)
})?;
Ok(conf)
}
@@ -1084,16 +1106,18 @@ impl PageServerConf {
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant"),
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
}
}
}
@@ -1325,16 +1349,18 @@ background_task_maximum_delay = '334 s'
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1399,16 +1425,18 @@ background_task_maximum_delay = '334 s'
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
max_vectored_read_bytes: MaxVectoredReadBytes(
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
.expect("Invalid default constant")
),
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -1,8 +1,6 @@
//! Periodically collect consumption metrics for all active tenants
//! and push them to an HTTP endpoint.
use crate::config::PageServerConf;
use crate::consumption_metrics::metrics::MetricsKey;
use crate::consumption_metrics::upload::KeyGen as _;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::size::CalculateSyntheticSizeError;
@@ -10,7 +8,6 @@ use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use itertools::Itertools as _;
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
use reqwest::Url;
@@ -22,8 +19,9 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::NodeId;
mod disk_cache;
mod metrics;
use crate::consumption_metrics::metrics::MetricsKey;
mod disk_cache;
mod upload;
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
@@ -145,12 +143,6 @@ async fn collect_metrics(
// these are point in time, with variable "now"
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
// Pre-generate event idempotency keys, to reuse them across the bucket
// and HTTP sinks.
let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
.take(metrics.len())
.collect_vec();
let metrics = Arc::new(metrics);
// why not race cancellation here? because we are one of the last tasks, and if we are
@@ -169,14 +161,8 @@ async fn collect_metrics(
}
if let Some(bucket_client) = &bucket_client {
let res = upload::upload_metrics_bucket(
bucket_client,
&cancel,
&node_id,
&metrics,
&idempotency_keys,
)
.await;
let res =
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
if let Err(e) = res {
tracing::error!("failed to upload to S3: {e:#}");
}
@@ -188,9 +174,9 @@ async fn collect_metrics(
&client,
metric_collection_endpoint,
&cancel,
&node_id,
&metrics,
&mut cached_metrics,
&idempotency_keys,
)
.await;
if let Err(e) = res {

View File

@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
cached_metrics: &mut Cache,
idempotency_keys: &[IdempotencyKey<'_>],
) -> anyhow::Result<()> {
let mut uploaded = 0;
let mut failed = 0;
let started_at = std::time::Instant::now();
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
while let Some(res) = iter.next() {
let (chunk, body) = res?;
@@ -87,7 +87,6 @@ pub(super) async fn upload_metrics_bucket(
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
idempotency_keys: &[IdempotencyKey<'_>],
) -> anyhow::Result<()> {
if metrics.is_empty() {
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
@@ -107,7 +106,7 @@ pub(super) async fn upload_metrics_bucket(
// Serialize and write into compressed buffer
let started_at = std::time::Instant::now();
for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
let (_chunk, body) = res?;
gzip_writer.write_all(&body).await?;
}
@@ -135,31 +134,29 @@ pub(super) async fn upload_metrics_bucket(
Ok(())
}
/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
/// idempotency keys are injected into the corresponding metric events (reused
/// across different metrics sinks), and must have the same length as input.
fn serialize_in_chunks<'a>(
// The return type is quite ugly, but we gain testability in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,
input: &'a [RawMetric],
idempotency_keys: &'a [IdempotencyKey<'a>],
factory: F,
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
where
F: KeyGen<'a> + 'a,
{
use bytes::BufMut;
assert_eq!(input.len(), idempotency_keys.len());
struct Iter<'a> {
struct Iter<'a, F> {
inner: std::slice::Chunks<'a, RawMetric>,
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
chunk_size: usize,
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
buffer: bytes::BytesMut,
// chunk amount of events are reused to produce the serialized document
scratch: Vec<Event<Ids, Name>>,
factory: F,
}
impl<'a> Iterator for Iter<'a> {
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
fn next(&mut self) -> Option<Self::Item> {
@@ -170,14 +167,17 @@ fn serialize_in_chunks<'a>(
self.scratch.extend(
chunk
.iter()
.zip(&mut self.idempotency_keys)
.map(|(raw_metric, key)| raw_metric.as_event(key)),
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
);
} else {
// next rounds: update_in_place to reuse allocations
assert_eq!(self.scratch.len(), self.chunk_size);
itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
.for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
self.scratch
.iter_mut()
.zip(chunk.iter())
.for_each(|(slot, raw_metric)| {
raw_metric.update_in_place(slot, &self.factory.generate())
});
}
let res = serde_json::to_writer(
@@ -198,19 +198,18 @@ fn serialize_in_chunks<'a>(
}
}
impl<'a> ExactSizeIterator for Iter<'a> {}
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
let buffer = bytes::BytesMut::new();
let inner = input.chunks(chunk_size);
let idempotency_keys = idempotency_keys.iter();
let scratch = Vec::new();
Iter {
inner,
idempotency_keys,
chunk_size,
buffer,
scratch,
factory,
}
}
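For context, the caller-side contract of the slice-based variant shown in these hunks, stitched together as a rough fragment from inside the collection task (not new API surface; names mirror the code above):
// Pre-generate one key per metric so every sink (HTTP and bucket) reuses the
// same idempotency keys across retries; serialize_in_chunks asserts the two
// slices have equal length.
let idempotency_keys: Vec<_> = std::iter::repeat_with(|| node_id.as_str().generate())
    .take(metrics.len())
    .collect();
for res in serialize_in_chunks(CHUNK_SIZE, &metrics, &idempotency_keys) {
    let (_chunk, body) = res?;
    // hand `body` to the upload path for this sink
}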
@@ -269,7 +268,7 @@ impl RawMetricExt for RawMetric {
}
}
pub(crate) trait KeyGen<'a> {
trait KeyGen<'a>: Copy {
fn generate(&self) -> IdempotencyKey<'a>;
}
@@ -390,10 +389,7 @@ mod tests {
let examples = metric_samples();
assert!(examples.len() > 1);
let now = Utc::now();
let idempotency_keys = (0..examples.len())
.map(|i| FixedGen::new(now, "1", i as u16).generate())
.collect::<Vec<_>>();
let factory = FixedGen::new(Utc::now(), "1", 42);
// need to use Event here because serde_json::Value uses default hashmap, not linked
// hashmap
@@ -402,13 +398,13 @@ mod tests {
events: Vec<Event<Ids, Name>>,
}
let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
let correct = serialize_in_chunks(examples.len(), &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
for chunk_size in 1..examples.len() {
let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
let actual = serialize_in_chunks(chunk_size, &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();

View File

@@ -105,10 +105,8 @@ pub struct RequestContext {
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
pub enum PageContentKind {
Unknown,
DeltaLayerSummary,
DeltaLayerBtreeNode,
DeltaLayerValue,
ImageLayerSummary,
ImageLayerBtreeNode,
ImageLayerValue,
InMemoryLayer,

View File

@@ -141,18 +141,12 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
m.other
);
let az_id = m
.other
.get("availability_zone_id")
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
Some(NodeRegisterRequest {
node_id: conf.id,
listen_pg_addr: m.postgres_host,
listen_pg_port: m.postgres_port,
listen_http_addr: m.http_host,
listen_http_port: m.http_port,
availability_zone_id: az_id,
})
}
Err(e) => {

View File

@@ -64,7 +64,7 @@ use crate::{
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
},
CancellableTask, DiskUsageEvictionTask,
};
@@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool {
}
impl EvictionOrder {
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
use EvictionOrder::*;
match self {
@@ -644,7 +644,6 @@ pub(crate) struct EvictionCandidate {
pub(crate) layer: EvictionLayer,
pub(crate) last_activity_ts: SystemTime,
pub(crate) relative_last_activity: finite_f32::FiniteF32,
pub(crate) visibility: LayerVisibilityHint,
}
impl std::fmt::Display for EvictionLayer {
@@ -686,22 +685,14 @@ impl std::fmt::Debug for EvictionCandidate {
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum EvictionPartition {
// A layer that is unwanted by the tenant: evict all these first, before considering
// any other layers
EvictNow,
// Above the minimum size threshold: this layer is a candidate for eviction.
enum MinResidentSizePartition {
Above,
// Below the minimum size threshold: this layer should only be evicted if all the
// tenants' layers above the minimum size threshold have already been considered.
Below,
}
enum EvictionCandidates {
Cancelled,
Finished(Vec<(EvictionPartition, EvictionCandidate)>),
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
}
/// Gather the eviction candidates.
@@ -899,10 +890,8 @@ async fn collect_eviction_candidates(
max_layer_size
};
// Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
// where the inputs are:
// - whether the layer is visible
// - whether the layer is above/below the min_resident_size cutline
// Sort layers most-recently-used first, then partition by
// cumsum above/below min_resident_size.
tenant_candidates
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
@@ -919,23 +908,12 @@ async fn collect_eviction_candidates(
candidate.relative_last_activity =
eviction_order.relative_last_activity(total, i);
let partition = match candidate.visibility {
LayerVisibilityHint::Covered => {
// Covered layers are evicted first
EvictionPartition::EvictNow
}
LayerVisibilityHint::Visible => {
cumsum += i128::from(candidate.layer.get_file_size());
if cumsum > min_resident_size as i128 {
EvictionPartition::Above
} else {
// The most recent layers below the min_resident_size threshold
// are the last to be evicted.
EvictionPartition::Below
}
}
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
} else {
MinResidentSizePartition::Below
};
cumsum += i128::from(candidate.layer.get_file_size());
(partition, candidate)
});
@@ -1003,7 +981,7 @@ async fn collect_eviction_candidates(
// Secondary locations' layers are always considered above the min resident size,
// i.e. secondary locations are permitted to be trimmed to zero layers if all
// the layers have sufficiently old access times.
EvictionPartition::Above,
MinResidentSizePartition::Above,
candidate,
)
});
@@ -1031,9 +1009,7 @@ async fn collect_eviction_candidates(
}
}
debug_assert!(EvictionPartition::Above < EvictionPartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above,
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
eviction_order.sort(&mut candidates);
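
A small self-contained sketch (hypothetical names) of why the debug asserts above hold on both sides of this change: the derived `Ord` follows declaration order, so sorting `(partition, candidate)` tuples naturally groups the partitions in eviction priority order.

```rust
/// Declaration order drives the derived `Ord`: EvictNow < Above < Below,
/// mirroring the eviction priority asserted above.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum Partition {
    EvictNow,
    Above,
    Below,
}

fn main() {
    // (partition, relative last activity) pairs standing in for real candidates.
    let mut candidates = vec![
        (Partition::Below, 3u32),
        (Partition::EvictNow, 9),
        (Partition::Above, 1),
        (Partition::Above, 7),
    ];
    // Tuples compare the partition first, so all EvictNow entries sort ahead of
    // Above, which sort ahead of Below, regardless of the second field.
    candidates.sort();
    assert_eq!(candidates.first().unwrap().0, Partition::EvictNow);
    assert_eq!(candidates.last().unwrap().0, Partition::Below);
}
```
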
@@ -1046,7 +1022,7 @@ async fn collect_eviction_candidates(
///
/// Returns the amount of candidates selected, with the planned usage.
fn select_victims<U: Usage>(
candidates: &[(EvictionPartition, EvictionCandidate)],
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
usage_pre: U,
) -> VictimSelection<U> {
let mut usage_when_switched = None;
@@ -1058,7 +1034,7 @@ fn select_victims<U: Usage>(
break;
}
if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
usage_when_switched = Some((usage_planned, i));
}

View File

@@ -178,8 +178,10 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
impl From<PageReconstructError> for ApiError {
fn from(pre: PageReconstructError) -> ApiError {
match pre {
PageReconstructError::Other(other) => ApiError::InternalServerError(other),
PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()),
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
PageReconstructError::MissingKey(e) => {
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
}
PageReconstructError::Cancelled => ApiError::Cancelled,
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
@@ -318,27 +320,6 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
}
}
impl From<crate::tenant::TimelineArchivalError> for ApiError {
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
use crate::tenant::TimelineArchivalError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
e @ HasArchivedParent(_) => {
ApiError::PreconditionFailed(e.to_string().into_boxed_str())
}
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
format!(
"Cannot archive timeline which has non-archived child timelines: {children:?}"
)
.into_boxed_str(),
),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
use crate::tenant::mgr::DeleteTimelineError::*;
@@ -426,8 +407,6 @@ async fn build_timeline_info_common(
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
// Report is_archived = false if the timeline is still loading
let is_archived = timeline.is_archived().unwrap_or(false);
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
@@ -468,7 +447,6 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,
state,
is_archived: Some(is_archived),
walreceiver_status,
@@ -710,7 +688,9 @@ async fn timeline_archival_config_handler(
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await?;
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
@@ -874,10 +854,7 @@ async fn get_timestamp_of_lsn_handler(
match result {
Some(time) => {
let time = format_rfc3339(
postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
)
.to_string();
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
json_response(StatusCode::OK, time)
}
None => Err(ApiError::NotFound(
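
The hunk above swaps `from_pg_timestamp` for a fallible `try_from_pg_timestamp`. As a rough sketch of what such a conversion has to do, assuming the usual Postgres representation of microseconds since 2000-01-01 UTC; the real `postgres_ffi` helpers are not reproduced here:

```rust
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Microseconds between the Unix epoch (1970-01-01) and the Postgres epoch (2000-01-01).
const PG_TO_UNIX_EPOCH_US: i64 = 946_684_800_000_000;

/// Shift a Postgres timestamp (microseconds since 2000-01-01) onto the Unix epoch.
/// Returns None for values that would land before 1970 or overflow.
fn pg_timestamp_to_system_time(pg_us: i64) -> Option<SystemTime> {
    let unix_us = pg_us.checked_add(PG_TO_UNIX_EPOCH_US)?;
    let unix_us = u64::try_from(unix_us).ok()?;
    Some(UNIX_EPOCH + Duration::from_micros(unix_us))
}
```
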
@@ -1731,12 +1708,13 @@ async fn timeline_compact_handler(
flags |= CompactFlags::ForceImageLayerCreation;
}
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
if !cfg!(feature = "testing") {
return Err(ApiError::InternalServerError(anyhow!(
"enhanced_gc_bottom_most_compaction is only available in testing mode"
)));
}
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
}
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
flags |= CompactFlags::DryRun;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -1809,11 +1787,9 @@ async fn timeline_checkpoint_handler(
}
if wait_until_uploaded {
tracing::info!("Waiting for uploads to complete...");
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
}
json_response(StatusCode::OK, ())
@@ -1911,7 +1887,7 @@ async fn timeline_detach_ancestor_handler(
// drop(tenant);
let resp = match progress {
detach_ancestor::Progress::Prepared(attempt, prepared) => {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
@@ -1919,10 +1895,11 @@ async fn timeline_detach_ancestor_handler(
tenant_shard_id,
timeline_id,
prepared,
attempt,
ctx,
)
.await?;
.await
.context("timeline detach ancestor completion")
.map_err(ApiError::InternalServerError)?;
AncestorDetached {
reparented_timelines,
@@ -2354,20 +2331,6 @@ async fn put_io_engine_handler(
json_response(StatusCode::OK, ())
}
async fn put_io_alignment_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let align: usize = json_request(&mut r).await?;
crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
ApiError::PreconditionFailed(
format!("Requested io alignment ({align}) is not a power of two").into(),
)
})?;
json_response(StatusCode::OK, ())
}
/// Polled by control plane.
///
/// See [`crate::utilization`].
@@ -2394,9 +2357,8 @@ async fn get_utilization(
// regenerate at most 1Hz to allow polling at any rate.
if !still_valid {
let path = state.conf.tenants_path();
let doc =
crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
.map_err(ApiError::InternalServerError)?;
let doc = crate::utilization::regenerate(path.as_std_path())
.map_err(ApiError::InternalServerError)?;
let mut buf = Vec::new();
serde_json::to_writer(&mut buf, &doc)
@@ -2980,7 +2942,7 @@ pub fn make_router(
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_compact_handler),
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
@@ -3055,9 +3017,6 @@ pub fn make_router(
|r| api_handler(r, timeline_collect_keyspace),
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.put("/v1/io_alignment", |r| {
api_handler(r, put_io_alignment_handler)
})
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|r| api_handler(r, force_aux_policy_switch_handler),

View File

@@ -19,7 +19,6 @@ use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::decode_wal_record;
use crate::walrecord::DecodedWALRecord;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
@@ -311,13 +310,11 @@ async fn import_wal(
let mut nrecords = 0;
let mut modification = tline.begin_modification(last_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let mut decoded = DecodedWALRecord::default();
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
walingest
.ingest_record(decoded, lsn, &mut modification, ctx)
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
WAL_INGEST.records_committed.inc();
@@ -452,12 +449,11 @@ pub async fn import_wal_from_tar(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(last_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let mut decoded = DecodedWALRecord::default();
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
walingest
.ingest_record(decoded, lsn, &mut modification, ctx)
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
modification.commit(ctx).await?;
last_lsn = lsn;

View File

@@ -1,10 +1,15 @@
use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
PageCached,
#[serde(rename_all = "snake_case")]
Direct { max_concurrency: NonZeroUsize },
Direct {
max_concurrency: NonZeroUsize,
},
}
impl Default for L0FlushConfig {
@@ -20,12 +25,14 @@ impl Default for L0FlushConfig {
pub struct L0FlushGlobalState(Arc<Inner>);
pub enum Inner {
PageCached,
Direct { semaphore: tokio::sync::Semaphore },
}
impl L0FlushGlobalState {
pub fn new(config: L0FlushConfig) -> Self {
match config {
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
L0FlushConfig::Direct { max_concurrency } => {
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
Self(Arc::new(Inner::Direct { semaphore }))
@@ -37,3 +44,13 @@ impl L0FlushGlobalState {
&self.0
}
}
impl L0FlushConfig {
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
use L0FlushConfig::*;
match self {
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
}
}
}
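
A minimal sketch of how the internally tagged config above might be consumed, assuming a JSON source and a tokio semaphore as the concurrency gate for the direct path; the wiring and names here are illustrative, not the pageserver's actual setup.

```rust
use std::{num::NonZeroUsize, sync::Arc};
use tokio::sync::Semaphore;

#[derive(Debug, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
enum FlushConfig {
    PageCached,
    Direct { max_concurrency: NonZeroUsize },
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // With `tag = "mode"` and kebab-case variants, the mode travels in-band:
    // {"mode":"page-cached"} or {"mode":"direct","max_concurrency":2}.
    let cfg: FlushConfig = serde_json::from_str(r#"{"mode":"direct","max_concurrency":2}"#)?;

    if let FlushConfig::Direct { max_concurrency } = cfg {
        // One shared semaphore caps how many flushes take the direct path at once.
        let gate = Arc::new(Semaphore::new(max_concurrency.get()));
        let _permit = gate.acquire().await?; // held for the duration of one flush
    }
    Ok(())
}
```
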

View File

@@ -16,7 +16,6 @@ pub mod l0_flush;
use futures::{stream::FuturesUnordered, StreamExt};
pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
mod assert_u64_eq_usize;
pub mod aux_file;
pub mod metrics;
pub mod page_cache;
@@ -50,7 +49,7 @@ use tracing::{info, info_span};
/// backwards-compatible changes to the metadata format.
pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const DEFAULT_PG_VERSION: u32 = 16;
pub const DEFAULT_PG_VERSION: u32 = 15;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -89,8 +88,6 @@ pub async fn shutdown_pageserver(
) {
use std::time::Duration;
let started_at = std::time::Instant::now();
// If the orderly shutdown below takes too long, we still want to make
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
//
@@ -244,10 +241,7 @@ pub async fn shutdown_pageserver(
walredo_extraordinary_shutdown_thread.join().unwrap();
info!("walredo_extraordinary_shutdown_thread done");
info!(
elapsed_ms = started_at.elapsed().as_millis(),
"Shut down successfully completed"
);
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -1552,6 +1552,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
pub(crate) enum ComputeCommandKind {
PageStreamV2,
PageStream,
Basebackup,
Fullbackup,
LeaseLsn,
@@ -1802,23 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_utilization_score",
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
)
.expect("failed to define a metric")
});
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_secondary_heatmap_total_size",
"The total size in bytes of all layers in the most recently downloaded heatmap.",
&["tenant_id", "shard_id"]
)
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1869,64 +1853,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub struct BackgroundLoopSemaphoreMetrics {
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
durations: EnumMap<BackgroundLoopKind, Counter>,
}
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|| {
let counters = register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap();
let durations = register_counter_vec!(
"pageserver_background_loop_semaphore_wait_duration_seconds",
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
&["task"],
)
.unwrap();
BackgroundLoopSemaphoreMetrics {
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
counters.with_label_values(&[kind.into()])
})),
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
durations.with_label_values(&[kind.into()])
})),
}
},
);
impl BackgroundLoopSemaphoreMetrics {
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
struct Record<'a> {
metrics: &'a BackgroundLoopSemaphoreMetrics,
task: BackgroundLoopKind,
_counter_guard: metrics::IntCounterPairGuard,
start: Instant,
}
impl Drop for Record<'_> {
fn drop(&mut self) {
let elapsed = self.start.elapsed().as_secs_f64();
self.metrics.durations[self.task].inc_by(elapsed);
}
}
Record {
metrics: self,
task,
_counter_guard: self.counters[task].guard(),
start: Instant::now(),
}
}
}
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
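
The `measure_acquisition` helper on one side of this hunk is an RAII timing guard: a counter-pair guard plus a wall-clock start, both settled on drop. Here is a stripped-down, std-only version of the same idea, with the metric sink reduced to an atomic microsecond counter; the real code records into prometheus counters instead.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

/// Total time spent waiting, in microseconds (stand-in for a prometheus counter).
static WAIT_MICROS: AtomicU64 = AtomicU64::new(0);

struct AcquisitionTimer {
    start: Instant,
}

impl Drop for AcquisitionTimer {
    fn drop(&mut self) {
        // Record the elapsed wait once the guard goes out of scope.
        WAIT_MICROS.fetch_add(self.start.elapsed().as_micros() as u64, Ordering::Relaxed);
    }
}

fn measure_acquisition() -> AcquisitionTimer {
    AcquisitionTimer { start: Instant::now() }
}
```

A caller would hold the returned guard across the semaphore acquire and drop it once the permit is granted, so the counter only accumulates time spent waiting.
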
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
@@ -2608,7 +2544,6 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
use crate::tenant::tasks::BackgroundLoopKind;
/// Maintain a per timeline gauge in addition to the global gauge.
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {

View File

@@ -557,7 +557,7 @@ impl PageServerHandler {
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
_protocol_version: PagestreamProtocolVersion,
protocol_version: PagestreamProtocolVersion,
ctx: RequestContext,
) -> Result<(), QueryError>
where
@@ -601,7 +601,8 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message");
// parse request
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
// invoke handler function
let (handler_result, span) = match neon_fe_msg {
@@ -753,21 +754,16 @@ impl PageServerHandler {
}
if request_lsn < **latest_gc_cutoff_lsn {
let gc_info = &timeline.gc_info.read().unwrap();
if !gc_info.leases.contains_key(&request_lsn) {
// The requested LSN is below gc cutoff and is not guarded by a lease.
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
} else {
PageStreamError::BadRequest(format!(
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
} else {
PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
).into())
});
}
});
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
@@ -794,8 +790,6 @@ impl PageServerHandler {
}
}
/// Handles the lsn lease request.
/// If a lease cannot be obtained, the client will receive NULL.
#[instrument(skip_all, fields(shard_id, %lsn))]
async fn handle_make_lsn_lease<IO>(
&mut self,
@@ -818,25 +812,19 @@ impl PageServerHandler {
.await?;
set_tracing_field_shard_id(&timeline);
let lease = timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
.inspect_err(|e| {
warn!("{e}");
})
.ok();
let valid_until_str = lease.map(|l| {
l.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
.expect("valid_until is earlier than UNIX_EPOCH")
.as_millis()
.to_string()
});
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
let valid_until = lease
.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
.map_err(|e| QueryError::Other(e.into()))?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
b"valid_until",
)]))?
.write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
.write_message_noflush(&BeMessage::DataRow(&[Some(
&valid_until.as_millis().to_be_bytes(),
)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Ok(())
}
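
For reference, a tiny sketch of the expiry encoding used by the variant of this handler that tolerates a missing lease: the lease (a stand-in type here) is rendered as a decimal millisecond string, and an absent lease becomes `None`, which the handler sends back as a NULL column.

```rust
use std::time::{SystemTime, UNIX_EPOCH};

/// Stand-in for the lease type: only the expiry matters here.
struct Lease {
    valid_until: SystemTime,
}

/// Render the expiry as milliseconds since the Unix epoch, or None when no
/// lease was granted (which becomes a NULL column in the response row).
fn lease_expiry_text(lease: Option<&Lease>) -> Option<String> {
    lease.map(|l| {
        l.valid_until
            .duration_since(UNIX_EPOCH)
            .expect("valid_until is earlier than UNIX_EPOCH")
            .as_millis()
            .to_string()
    })
}
```
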
@@ -1287,6 +1275,35 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStream)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V1,
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(

View File

@@ -15,11 +15,12 @@ use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::AuxFilePolicy;
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -168,13 +170,10 @@ impl Timeline {
DatadirModification {
tline: self,
pending_lsns: Vec::new(),
pending_metadata_pages: HashMap::new(),
pending_data_pages: Vec::new(),
pending_zero_data_pages: Default::default(),
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
pending_bytes: 0,
lsn,
}
}
@@ -288,7 +287,10 @@ impl Timeline {
// then check if the database was already initialized.
// get_rel_exists can be called before dbdir is created.
let buf = version.get(self, DBDIR_KEY, ctx).await?;
let dbdirs = DbDirectory::des(&buf)?.dbdirs;
let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.dbdirs),
Err(e) => Err(PageReconstructError::from(e)),
}?;
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
return Ok(false);
}
@@ -296,8 +298,13 @@ impl Timeline {
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = version.get(self, key, ctx).await?;
let dir = RelDirectory::des(&buf)?;
Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
Ok(exists)
}
Err(e) => Err(PageReconstructError::from(e)),
}
}
/// Get a list of all existing relations in given tablespace and database.
@@ -316,16 +323,20 @@ impl Timeline {
let key = rel_dir_to_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let dir = RelDirectory::des(&buf)?;
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
relnode: *relnode,
forknum: *forknum,
}));
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
relnode: *relnode,
forknum: *forknum,
}));
Ok(rels)
Ok(rels)
}
Err(e) => Err(PageReconstructError::from(e)),
}
}
/// Get the whole SLRU segment
@@ -387,8 +398,13 @@ impl Timeline {
let key = slru_dir_to_key(kind);
let buf = version.get(self, key, ctx).await?;
let dir = SlruSegmentDirectory::des(&buf)?;
Ok(dir.segments.contains(&segno))
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.segments.contains(&segno);
Ok(exists)
}
Err(e) => Err(PageReconstructError::from(e)),
}
}
/// Locate LSN, such that all transactions that committed before
@@ -604,7 +620,10 @@ impl Timeline {
let key = slru_dir_to_key(kind);
let buf = version.get(self, key, ctx).await?;
Ok(SlruSegmentDirectory::des(&buf)?.segments)
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.segments),
Err(e) => Err(PageReconstructError::from(e)),
}
}
pub(crate) async fn get_relmap_file(
@@ -628,7 +647,10 @@ impl Timeline {
// fetch directory entry
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
Ok(DbDirectory::des(&buf)?.dbdirs)
match DbDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.dbdirs),
Err(e) => Err(PageReconstructError::from(e)),
}
}
pub(crate) async fn get_twophase_file(
@@ -650,7 +672,10 @@ impl Timeline {
// fetch directory entry
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
Ok(TwoPhaseDirectory::des(&buf)?.xids)
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.xids),
Err(e) => Err(PageReconstructError::from(e)),
}
}
pub(crate) async fn get_control_file(
@@ -675,7 +700,10 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
match self.get(AUX_FILES_KEY, lsn, ctx).await {
Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.files),
Err(e) => Err(PageReconstructError::from(e)),
},
Err(e) => {
// This is expected: historical databases do not have the key.
debug!("Failed to get info about AUX files: {}", e);
@@ -691,14 +719,13 @@ impl Timeline {
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let kv = self
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
.await?;
.await
.context("scan")?;
let mut result = HashMap::new();
let mut sz = 0;
for (_, v) in kv {
let v = v?;
let v = aux_file::decode_file_value_bytes(&v)
.context("value decode")
.map_err(PageReconstructError::Other)?;
let v = v.context("get value")?;
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
for (fname, content) in v {
sz += fname.len();
sz += content.len();
@@ -728,17 +755,7 @@ impl Timeline {
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) => {
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
self.list_aux_files_v1(lsn, ctx).await
}
None => {
let res = self.list_aux_files_v1(lsn, ctx).await?;
if !res.is_empty() {
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
}
Ok(res)
}
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -776,10 +793,11 @@ impl Timeline {
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
let kv = self
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
.await?;
.await
.context("scan")?;
let mut result = HashMap::new();
for (k, v) in kv {
let v = v?;
let v = v.context("get value")?;
let origin_id = k.field6 as RepOriginId;
let origin_lsn = Lsn::des(&v).unwrap();
if origin_lsn != Lsn::INVALID {
@@ -1033,51 +1051,21 @@ pub struct DatadirModification<'a> {
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_nblocks: i64,
/// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
/// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
/// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
/// which keys are stored here.
pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
// Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
// if we encounter a write from postgres in the same wal record, we will drop this entry.
//
// Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
// at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
pending_zero_data_pages: HashSet<CompactKey>,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how large our EphemeralFile write will be when committed.
pending_bytes: usize,
}
impl<'a> DatadirModification<'a> {
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}
pub(crate) fn approx_pending_bytes(&self) -> usize {
self.pending_bytes
}
pub(crate) fn has_dirty_data_pages(&self) -> bool {
(!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
@@ -1086,10 +1074,6 @@ impl<'a> DatadirModification<'a> {
lsn,
self.lsn
);
// If we are advancing LSN, then state from previous wal record should have been flushed.
assert!(self.pending_zero_data_pages.is_empty());
if lsn > self.lsn {
self.pending_lsns.push(self.lsn);
self.lsn = lsn;
@@ -1097,17 +1081,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
/// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
/// keys that represent literal blocks that postgres can read. So data includes relation blocks and
/// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
///
/// The distinction is important because data keys are handled on a fast path where dirty writes are
/// not readable until this modification is committed, whereas metadata keys are visible for read
/// via [`Self::get`] as soon as their record has been ingested.
fn is_data_key(key: &Key) -> bool {
key.is_rel_block_key() || key.is_slru_block_key()
}
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -1215,31 +1188,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
self.pending_zero_data_pages
.insert(rel_block_to_key(rel, blknum).to_compact());
self.pending_bytes += ZERO_PAGE.len();
}
pub(crate) fn put_slru_page_image_zero(
&mut self,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
) {
self.pending_zero_data_pages
.insert(slru_block_to_key(kind, segno, blknum).to_compact());
self.pending_bytes += ZERO_PAGE.len();
}
/// Call this at the end of each WAL record.
pub(crate) fn on_record_end(&mut self) {
let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
for key in pending_zero_data_pages {
self.put_data(key, Value::Image(ZERO_PAGE.clone()));
}
}
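
A standalone sketch (hypothetical types) of the zero-page bookkeeping above: zero pages queued while a WAL record is being ingested are dropped again if a real image for the same key shows up in that record, and whatever is left is materialized in `on_record_end`.

```rust
use std::collections::HashSet;

type Key = u64; // stand-in for the compact page key
const ZERO_PAGE: &[u8] = &[0u8; 8192];

#[derive(Default)]
struct PendingWrites {
    zeroed: HashSet<Key>,
    images: Vec<(Key, Vec<u8>)>,
}

impl PendingWrites {
    /// Extending a relation queues a zero page for the new block.
    fn put_zero(&mut self, key: Key) {
        self.zeroed.insert(key);
    }

    /// A real write in the same WAL record supersedes the queued zero page.
    fn put_image(&mut self, key: Key, img: Vec<u8>) {
        self.zeroed.remove(&key);
        self.images.push((key, img));
    }

    /// At the end of the record, any zero pages that were never overwritten
    /// are materialized as explicit writes.
    fn on_record_end(&mut self) {
        for key in std::mem::take(&mut self.zeroed) {
            self.images.push((key, ZERO_PAGE.to_vec()));
        }
    }
}
```
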
/// Store a relmapper file (pg_filenode.map) in the repository
pub async fn put_relmap_file(
&mut self,
@@ -1657,7 +1605,6 @@ impl<'a> DatadirModification<'a> {
if aux_files_key_v1.is_empty() {
None
} else {
warn!("this timeline is using deprecated aux file policy V1");
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
@@ -1786,17 +1733,12 @@ impl<'a> DatadirModification<'a> {
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
e @ (PageReconstructError::Other(_)
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey(_)),
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
if !matches!(e, PageReconstructError::MissingKey(_)) {
let e = utils::error::report_compact_sources(&e);
tracing::warn!("treating error as if it was a missing key: {}", e);
}
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
@@ -1838,7 +1780,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -1849,12 +1791,23 @@ impl<'a> DatadirModification<'a> {
let mut writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
if key.is_rel_block_key() || key.is_slru_block_key() {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;
} else {
retained_pending_updates
.entry(key)
.or_default()
.push((lsn, value));
}
}
}
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put_batch(pending_data_pages, ctx).await?;
self.pending_bytes = 0;
self.pending_updates = retained_pending_updates;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1874,31 +1827,23 @@ impl<'a> DatadirModification<'a> {
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
// Commit should never be called mid-wal-record
assert!(self.pending_zero_data_pages.is_empty());
let mut writer = self.tline.writer().await;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
// Ordering: the items in this batch do not need to be in any global order, but values for
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
// this to do efficient updates to its index.
let mut write_batch = std::mem::take(&mut self.pending_data_pages);
if !self.pending_updates.is_empty() {
// The put_batch call below expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
write_batch.extend(
self.pending_metadata_pages
.drain()
.flat_map(|(key, values)| {
values
.into_iter()
.map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
}),
);
if !write_batch.is_empty() {
writer.put_batch(write_batch, ctx).await?;
writer.put_batch(lsn_ordered_batch, ctx).await?;
}
if !self.pending_deletions.is_empty() {
@@ -1923,64 +1868,37 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_bytes = 0;
Ok(())
}
pub(crate) fn len(&self) -> usize {
self.pending_metadata_pages.len()
+ self.pending_data_pages.len()
+ self.pending_deletions.len()
self.pending_updates.len() + self.pending_deletions.len()
}
/// Read a page from the Timeline we are writing to. For metadata pages, this passes through
/// a cache in Self, which makes writes earlier in this modification visible to WAL records later
/// in the modification.
///
/// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
/// page must ensure that the pages it reads are already committed in the Timeline; for example,
/// DB create operations are always preceded by a call to commit(). This is special-cased because
/// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
/// and not data pages.
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
if !Self::is_data_key(&key) {
// Have we already updated the same key? Read the latest pending updated
// version in that case.
//
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
if let Some((_, _, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::Other(anyhow::anyhow!(
"unexpected pending WAL record"
)))
};
}
}
} else {
// This is an expensive check, so we only do it in debug mode. If reading a data key,
// this key should never be present in pending_data_pages. We ensure this by committing
// modifications before ingesting DB create operations, which are the only kind that reads
// data pages during ingest.
if cfg!(debug_assertions) {
for (dirty_key, _, _, _) in &self.pending_data_pages {
debug_assert!(&key.to_compact() != dirty_key);
}
// Internal helper functions to batch the modifications
debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
// Have we already updated the same key? Read the latest pending updated
// version in that case.
//
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
};
}
}
// Metadata page cache miss, or we're reading a data page.
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
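
A simplified sketch of the read path described in the doc comment above, with hypothetical types: metadata keys are answered from the not-yet-committed pending map when possible, while data keys always go to the underlying store.

```rust
use std::collections::HashMap;

type Key = u64;

struct Modification<'a> {
    store: &'a HashMap<Key, Vec<u8>>,        // committed state (stand-in for the Timeline)
    pending_metadata: HashMap<Key, Vec<u8>>, // latest uncommitted image per metadata key
}

impl<'a> Modification<'a> {
    fn is_data_key(key: Key) -> bool {
        key % 2 == 0 // placeholder predicate; the real one checks rel/SLRU block keys
    }

    /// Metadata reads see writes made earlier in this modification; data reads do not.
    fn get(&self, key: Key) -> Option<&[u8]> {
        if !Self::is_data_key(key) {
            if let Some(img) = self.pending_metadata.get(&key) {
                return Some(img.as_slice());
            }
        }
        self.store.get(&key).map(|v| v.as_slice())
    }
}
```
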
@@ -1992,48 +1910,15 @@ impl<'a> DatadirModification<'a> {
}
fn put(&mut self, key: Key, val: Value) {
if Self::is_data_key(&key) {
self.put_data(key.to_compact(), val)
} else {
self.put_metadata(key.to_compact(), val)
}
}
fn put_data(&mut self, key: CompactKey, val: Value) {
let val_serialized_size = val.serialized_size().unwrap() as usize;
// If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This
// is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
// and the subsequent postgres-originating write
if self.pending_zero_data_pages.remove(&key) {
self.pending_bytes -= ZERO_PAGE.len();
}
self.pending_bytes += val_serialized_size;
self.pending_data_pages
.push((key, self.lsn, val_serialized_size, val))
}
fn put_metadata(&mut self, key: CompactKey, val: Value) {
let values = self.pending_metadata_pages.entry(key).or_default();
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
if let Some((last_lsn, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
// Update the pending_bytes contribution from this entry, and update the serialized size in place
self.pending_bytes -= *last_value_ser_size;
*last_value_ser_size = val.serialized_size().unwrap() as usize;
self.pending_bytes += *last_value_ser_size;
// Use the latest value: this replaces any earlier write to the same (key, lsn), such as may
// have been generated by synthesized zero page writes prior to the first real write to a page.
*last_value = val;
return;
}
}
let val_serialized_size = val.serialized_size().unwrap() as usize;
self.pending_bytes += val_serialized_size;
values.push((self.lsn, val_serialized_size, val));
values.push((self.lsn, val));
}
fn delete(&mut self, key_range: Range<Key>) {
@@ -2163,7 +2048,7 @@ mod tests {
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();

View File

@@ -146,12 +146,6 @@ impl FromStr for TokioRuntimeMode {
}
}
static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
// the default 2MiB are insufficent, especially in debug mode
.unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
});
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
let thread_name = "pageserver-tokio";
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
@@ -170,7 +164,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
tokio::runtime::Builder::new_current_thread()
.thread_name(thread_name)
.enable_all()
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
.build()
.expect("failed to create one single runtime")
}
@@ -180,7 +173,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
.thread_name(thread_name)
.enable_all()
.worker_threads(num_workers.get())
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
.build()
.expect("failed to create one multi-threaded runtime")
}
@@ -207,7 +199,6 @@ macro_rules! pageserver_runtime {
.thread_name($name)
.worker_threads(TOKIO_WORKER_THREADS.get())
.enable_all()
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
.build()
.expect(std::concat!("Failed to create runtime ", $name))
});
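
The hunks above add or remove a configurable worker stack size. A minimal sketch of that knob in isolation, with a made-up environment variable name; tokio's builder accepts the size directly.

```rust
use std::num::NonZeroUsize;

fn build_runtime(name: &'static str) -> tokio::runtime::Runtime {
    // 2 MiB (a common platform default) can be tight in debug builds; allow an override.
    let stack_size = std::env::var("EXAMPLE_TOKIO_THREAD_STACK_SIZE")
        .ok()
        .and_then(|v| v.parse::<NonZeroUsize>().ok())
        .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap());

    tokio::runtime::Builder::new_multi_thread()
        .thread_name(name)
        .thread_stack_size(stack_size.get())
        .enable_all()
        .build()
        .expect("failed to build runtime")
}
```
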
@@ -402,7 +393,7 @@ struct PageServerTask {
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_shard_id: TenantShardId,
tenant_shard_id: Option<TenantShardId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>,
@@ -414,7 +405,7 @@ struct PageServerTask {
pub fn spawn<F>(
runtime: &tokio::runtime::Handle,
kind: TaskKind,
tenant_shard_id: TenantShardId,
tenant_shard_id: Option<TenantShardId>,
timeline_id: Option<TimelineId>,
name: &str,
future: F,
@@ -559,7 +550,7 @@ pub async fn shutdown_tasks(
let tasks = TASKS.lock().unwrap();
for task in tasks.values() {
if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
{
task.cancel.cancel();
@@ -582,8 +573,13 @@ pub async fn shutdown_tasks(
};
if let Some(mut join_handle) = join_handle {
if log_all {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
if tenant_shard_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
.await

View File

@@ -41,7 +41,6 @@ use tokio::sync::watch;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use upload_queue::NotInitialized;
use utils::backoff;
use utils::circuit_breaker::CircuitBreaker;
use utils::completion;
@@ -302,11 +301,7 @@ pub struct Tenant {
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach concurrency limiter.
///
/// As a tenant will likely be restarted as part of timeline detach ancestor, it makes no sense
/// to have two running at the same time. A different one can be started if an earlier one
/// has failed for whatever reason.
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
@@ -501,42 +496,6 @@ impl Debug for DeleteTimelineError {
}
}
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
#[error("NotFound")]
NotFound,
#[error("Timeout")]
Timeout,
#[error("ancestor is archived: {}", .0)]
HasArchivedParent(TimelineId),
#[error("HasUnarchivedChildren")]
HasUnarchivedChildren(Vec<TimelineId>),
#[error("Timeline archival is already in progress")]
AlreadyInProgress,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Debug for TimelineArchivalError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotFound => write!(f, "NotFound"),
Self::Timeout => write!(f, "Timeout"),
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
Self::HasUnarchivedChildren(c) => {
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
}
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
}
pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
Broken,
@@ -642,15 +601,6 @@ impl From<PageReconstructError> for GcError {
}
}
impl From<NotInitialized> for GcError {
fn from(value: NotInitialized) -> Self {
match value {
NotInitialized::Uninitialized => GcError::Remote(value.into()),
NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
}
}
}
impl From<timeline::layer_manager::Shutdown> for GcError {
fn from(_: timeline::layer_manager::Shutdown) -> Self {
GcError::TimelineCancelled
@@ -834,7 +784,7 @@ impl Tenant {
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
tenant_shard_id,
Some(tenant_shard_id),
None,
"attach tenant",
async move {
@@ -873,20 +823,14 @@ impl Tenant {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
assert!(
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
*state = TenantState::broken_from_reason(err.to_string());
});
};
// TODO: should also be rejecting tenant conf changes that violate this check.
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
let mut init_order = init_order;
// take the completion because initial tenant loading will complete when all of
// these tasks complete.
@@ -1368,59 +1312,24 @@ impl Tenant {
&self,
timeline_id: TimelineId,
state: TimelineArchivalState,
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
let timeline = {
let timelines = self.timelines.lock().unwrap();
let Some(timeline) = timelines.get(&timeline_id) else {
return Err(TimelineArchivalError::NotFound);
};
if state == TimelineArchivalState::Unarchived {
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
if ancestor_timeline.is_archived() == Some(true) {
return Err(TimelineArchivalError::HasArchivedParent(
ancestor_timeline.timeline_id,
));
}
}
}
// Ensure that there are no non-archived child timelines
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
return None;
}
if entry.is_archived() == Some(true) {
return None;
}
Some(*id)
})
.collect();
if !children.is_empty() && state == TimelineArchivalState::Archived {
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
}
Arc::clone(timeline)
};
) -> anyhow::Result<()> {
let timeline = self
.get_timeline(timeline_id, false)
.context("Cannot apply timeline archival config to inexistent timeline")?;
let upload_needed = timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(state)?;
if upload_needed {
info!("Uploading new state");
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
return Err(TimelineArchivalError::Timeout);
bail!("reached timeout for upload queue flush");
};
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
v?;
}
Ok(())
}
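
Both versions of this function bound the wait on the upload queue. A generic sketch of that pattern, assuming a fixed 10 second budget and an `anyhow`-flavoured error path:

```rust
use std::time::Duration;

/// Wait for an async flush to finish, but give up after a fixed deadline.
async fn flush_with_deadline<F>(flush: F) -> anyhow::Result<()>
where
    F: std::future::Future<Output = anyhow::Result<()>>,
{
    const MAX_WAIT: Duration = Duration::from_secs(10);
    match tokio::time::timeout(MAX_WAIT, flush).await {
        Ok(res) => res, // the flush finished, successfully or not
        Err(_elapsed) => anyhow::bail!("reached timeout for upload queue flush"),
    }
}
```
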
@@ -3813,27 +3722,6 @@ impl Tenant {
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
}
/// How much local storage would this tenant like to have? It can cope with
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
/// than they report here, due to layer eviction. Tenants with many active branches may
/// actually use more than they report here.
pub(crate) fn local_storage_wanted(&self) -> u64 {
let timelines = self.timelines.lock().unwrap();
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
// reflects the observation that on tenants with multiple large branches, typically only one
// of them is used actively enough to occupy space on disk.
timelines
.values()
.map(|t| t.metrics.visible_physical_size_gauge.get())
.max()
.unwrap_or(0)
}
}
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
@@ -4576,13 +4464,10 @@ mod tests {
// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(
err.to_string().starts_with(&format!(
"bad state on timeline {}: Broken",
tline.timeline_id
)),
"{err}"
);
assert!(err.to_string().starts_with(&format!(
"Bad state on timeline {}: Broken",
tline.timeline_id
)));
Ok(())
}
@@ -6017,10 +5902,10 @@ mod tests {
.await
.unwrap();
// the default aux file policy to switch is v2 if not set by the admins
// the default aux file policy to switch is v1 if not set by the admins
assert_eq!(
harness.tenant_conf.switch_aux_file_policy,
AuxFilePolicy::default_tenant_config()
AuxFilePolicy::V1
);
let (tenant, ctx) = harness.load().await;
@@ -6064,8 +5949,8 @@ mod tests {
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
Some(AuxFilePolicy::V1),
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
);
// we can read everything from the storage
@@ -6087,8 +5972,8 @@ mod tests {
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"keep v2 storage format when new files are written"
Some(AuxFilePolicy::V1),
"keep v1 storage format when new files are written"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6104,7 +5989,7 @@ mod tests {
// child copies the last flag even if that is not on remote storage yet
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
@@ -7090,14 +6975,18 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..Key::NON_L0_MAX,
key_range: {
let mut key = Key::MAX;
key.field6 -= 1;
Key::MIN..key
},
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
// The delta layer that is cut in the middle
PersistentLayerKey {
key_range: Key::MIN..Key::NON_L0_MAX,
lsn_range: Lsn(0x30)..Lsn(0x48),
key_range: get_key(3)..get_key(4),
lsn_range: Lsn(0x30)..Lsn(0x41),
is_delta: true
},
// The delta3 layer that should not be picked for the compaction
@@ -8077,214 +7966,6 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
{
let harness =
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
.await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(1),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
];
let delta2 = vec![
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(1),
Lsn(0x38),
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
// delta1 and delta2 each contain only a single key, but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![
(Lsn(0x10), tline.timeline_id),
(Lsn(0x20), tline.timeline_id),
],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_20 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_10 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), gc_horizon, &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x20), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_20[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x10), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_10[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect the layer map to be the same b/c of the dry-run flag, but we don't know whether other background jobs
// will be cleaning things up, and therefore we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
}
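The expected strings in the test above follow directly from the layer contents: the image value at 0x10 plus every WAL append whose LSN is at or below the LSN being read. A hypothetical helper (not part of the crate) that reconstructs them:
// Rebuild an expected value string: base image at 0x10, then every append
// record whose LSN is at or below the read LSN, in order.
fn expected_value(key_id: u32, appends: &[(u64, &str)], read_lsn: u64) -> String {
    let mut v = format!("value {key_id}@0x10");
    for (lsn, suffix) in appends {
        if *lsn <= read_lsn {
            v.push_str(suffix);
        }
    }
    v
}

// expected_value(1, &[(0x20, "@0x20"), (0x28, "@0x28"), (0x30, "@0x30"), (0x38, "@0x38")], 0x20)
// == "value 1@0x10@0x20", matching expected_result_at_lsn_20[1] above.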
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;

View File

@@ -24,7 +24,6 @@ use tracing::warn;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::io::{Error, ErrorKind};
@@ -148,7 +147,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
/// The maximum size of blobs we support. The highest few bits
/// are reserved for compression and other further uses.
pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
@@ -187,11 +186,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
/// You need to make sure that the internal buffer is empty, otherwise
/// data will be written in the wrong order.
#[inline(always)]
async fn write_all_unbuffered<Buf: IoBuf + Send>(
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
src_buf: FullSlice<Buf>,
src_buf: B,
ctx: &RequestContext,
) -> (FullSlice<Buf>, Result<(), Error>) {
) -> (B::Buf, Result<(), Error>) {
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
let nbytes = match res {
Ok(nbytes) => nbytes,
@@ -205,9 +204,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
/// Flushes the internal buffer to the underlying `VirtualFile`.
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
let buf = std::mem::take(&mut self.buf);
let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await;
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
res?;
let mut buf = slice.into_raw_slice().into_inner();
buf.clear();
self.buf = buf;
Ok(())
@@ -224,30 +222,19 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
}
/// Internal, possibly buffered, write function
async fn write_all<Buf: IoBuf + Send>(
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
src_buf: FullSlice<Buf>,
src_buf: B,
ctx: &RequestContext,
) -> (FullSlice<Buf>, Result<(), Error>) {
let src_buf = src_buf.into_raw_slice();
let src_buf_bounds = src_buf.bounds();
let restore = move |src_buf_slice: Slice<_>| {
FullSlice::must_new(Slice::from_buf_bounds(
src_buf_slice.into_inner(),
src_buf_bounds,
))
};
) -> (B::Buf, Result<(), Error>) {
if !BUFFERED {
assert!(self.buf.is_empty());
return self
.write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
.await;
return self.write_all_unbuffered(src_buf, ctx).await;
}
let remaining = Self::CAPACITY - self.buf.len();
let src_buf_len = src_buf.bytes_init();
if src_buf_len == 0 {
return (restore(src_buf), Ok(()));
return (Slice::into_inner(src_buf.slice_full()), Ok(()));
}
let mut src_buf = src_buf.slice(0..src_buf_len);
// First try to copy as much as we can into the buffer
@@ -258,7 +245,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
// Then, if the buffer is full, flush it out
if self.buf.len() == Self::CAPACITY {
if let Err(e) = self.flush_buffer(ctx).await {
return (restore(src_buf), Err(e));
return (Slice::into_inner(src_buf), Err(e));
}
}
// Finally, write the tail of src_buf:
@@ -271,29 +258,27 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
let copied = self.write_into_buffer(&src_buf);
// We just verified above that src_buf fits into our internal buffer.
assert_eq!(copied, src_buf.len());
restore(src_buf)
Slice::into_inner(src_buf)
} else {
let (src_buf, res) = self
.write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
.await;
let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
if let Err(e) = res {
return (src_buf, Err(e));
}
src_buf
}
} else {
restore(src_buf)
Slice::into_inner(src_buf)
};
(src_buf, Ok(()))
}
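Both signatures of `write_all` above share the same buffering policy, which the interleaved hunk makes a little hard to follow. A length-only sketch of that policy, with an assumed capacity (this is an illustration, not the crate's BlobWriter):
// Model of the buffered write path: copy what fits, flush when the buffer is
// exactly full, then either buffer the tail or write it through directly.
struct SketchWriter {
    buf: Vec<u8>, // in-memory tail, at most CAPACITY bytes
    flushed: u64, // bytes already handed to the underlying file
}

impl SketchWriter {
    const CAPACITY: usize = 8192; // illustrative; the real writer defines its own

    fn write_all(&mut self, mut src: &[u8]) {
        // First copy as much as we can into the buffer.
        let take = src.len().min(Self::CAPACITY - self.buf.len());
        self.buf.extend_from_slice(&src[..take]);
        src = &src[take..];
        // Then, if the buffer is full, flush it out.
        if self.buf.len() == Self::CAPACITY {
            self.flushed += self.buf.len() as u64;
            self.buf.clear();
        }
        // Finally, the tail either fits into the now-empty buffer or bypasses it.
        if src.len() < Self::CAPACITY {
            self.buf.extend_from_slice(src);
        } else {
            self.flushed += src.len() as u64;
        }
    }
}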
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub async fn write_blob<Buf: IoBuf + Send>(
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
srcbuf: FullSlice<Buf>,
srcbuf: B,
ctx: &RequestContext,
) -> (FullSlice<Buf>, Result<u64, Error>) {
) -> (B::Buf, Result<u64, Error>) {
let (buf, res) = self
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await;
@@ -302,40 +287,43 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
pub(crate) async fn write_blob_maybe_compressed<Buf: IoBuf + Send>(
pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
&mut self,
srcbuf: FullSlice<Buf>,
srcbuf: B,
ctx: &RequestContext,
algorithm: ImageCompressionAlgorithm,
) -> (FullSlice<Buf>, Result<(u64, CompressionInfo), Error>) {
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
let offset = self.offset;
let mut compression_info = CompressionInfo {
written_compressed: false,
compressed_size: None,
};
let len = srcbuf.len();
let len = srcbuf.bytes_init();
let mut io_buf = self.io_buf.take().expect("we always put it back below");
io_buf.clear();
let mut compressed_buf = None;
let ((io_buf_slice, hdr_res), srcbuf) = async {
let ((io_buf, hdr_res), srcbuf) = async {
if len < 128 {
// Short blob. Write a 1-byte length header
io_buf.put_u8(len as u8);
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
(
self.write_all(io_buf, ctx).await,
srcbuf.slice_full().into_inner(),
)
} else {
// Write a 4-byte length header
if len > MAX_SUPPORTED_BLOB_LEN {
if len > MAX_SUPPORTED_LEN {
return (
(
io_buf.slice_len(),
io_buf,
Err(Error::new(
ErrorKind::Other,
format!("blob too large ({len} bytes)"),
)),
),
srcbuf,
srcbuf.slice_full().into_inner(),
);
}
let (high_bit_mask, len_written, srcbuf) = match algorithm {
@@ -348,7 +336,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
} else {
async_compression::tokio::write::ZstdEncoder::new(Vec::new())
};
encoder.write_all(&srcbuf[..]).await.unwrap();
let slice = srcbuf.slice_full();
encoder.write_all(&slice[..]).await.unwrap();
encoder.shutdown().await.unwrap();
let compressed = encoder.into_inner();
compression_info.compressed_size = Some(compressed.len());
@@ -356,29 +345,31 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
compression_info.written_compressed = true;
let compressed_len = compressed.len();
compressed_buf = Some(compressed);
(BYTE_ZSTD, compressed_len, srcbuf)
(BYTE_ZSTD, compressed_len, slice.into_inner())
} else {
(BYTE_UNCOMPRESSED, len, srcbuf)
(BYTE_UNCOMPRESSED, len, slice.into_inner())
}
}
ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf),
ImageCompressionAlgorithm::Disabled => {
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
}
};
let mut len_buf = (len_written as u32).to_be_bytes();
assert_eq!(len_buf[0] & 0xf0, 0);
len_buf[0] |= high_bit_mask;
io_buf.extend_from_slice(&len_buf[..]);
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
(self.write_all(io_buf, ctx).await, srcbuf)
}
}
.await;
self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner());
self.io_buf = Some(io_buf);
match hdr_res {
Ok(_) => (),
Err(e) => return (srcbuf, Err(e)),
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
}
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await;
(srcbuf, res)
let (_buf, res) = self.write_all(compressed_buf, ctx).await;
(Slice::into_inner(srcbuf.slice(..)), res)
} else {
self.write_all(srcbuf, ctx).await
};
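The two hunks above leave the blob header format itself unchanged: blobs shorter than 0x80 bytes get a single length byte, anything longer gets a 4-byte big-endian length whose top nibble carries the compression flags. A self-contained sketch of that layout, using the constants defined in this file (the encoder function itself is hypothetical):
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;

// Encode the length header that precedes a blob's payload. Short blobs are
// always stored uncompressed; compression only applies to the 4-byte form.
fn encode_len_header(len: usize, zstd: bool) -> Vec<u8> {
    assert!(len <= MAX_SUPPORTED_BLOB_LEN, "blob too large ({len} bytes)");
    if len < 0x80 {
        // One length byte with the high bit clear.
        vec![len as u8]
    } else {
        // 4-byte big-endian length; the top nibble is reserved for flags.
        let mut buf = (len as u32).to_be_bytes();
        debug_assert_eq!(buf[0] & 0xf0, 0);
        buf[0] |= if zstd { BYTE_ZSTD } else { BYTE_UNCOMPRESSED };
        buf.to_vec()
    }
}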
@@ -441,21 +432,21 @@ pub(crate) mod tests {
let (_, res) = if compression {
let res = wtr
.write_blob_maybe_compressed(
blob.clone().slice_len(),
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await;
(res.0, res.1.map(|(off, _)| off))
} else {
wtr.write_blob(blob.clone().slice_len(), ctx).await
wtr.write_blob(blob.clone(), ctx).await
};
let offs = res?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await;
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
let offs = res?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer(ctx).await?;

View File

@@ -2,6 +2,7 @@
//! Low-level Block-oriented I/O functions
//!
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
@@ -80,7 +81,9 @@ impl<'a> Deref for BlockLease<'a> {
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader<'a>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
Slice(&'a [u8]),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
@@ -97,7 +100,9 @@ impl<'a> BlockReaderRef<'a> {
use BlockReaderRef::*;
match self {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
Slice(s) => Self::read_blk_slice(s, blknum),
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
@@ -106,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
}
}
impl<'a> BlockReaderRef<'a> {
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
let end = start.checked_add(PAGE_SZ).unwrap();
if end > slice.len() {
return Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
format!("slice too short, len={} end={}", slice.len(), end),
));
}
let slice = &slice[start..end];
let page_sized: &[u8; PAGE_SZ] = slice
.try_into()
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
Ok(BlockLease::Slice(page_sized))
}
}
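A standalone restatement of the bounds arithmetic in `read_blk_slice` above; the page size is assumed here to be the usual 8 KiB (the crate defines its own PAGE_SZ constant):
const PAGE_SZ: usize = 8192; // assumed for illustration

// Byte range occupied by block `blknum` in a flat slice, or None when the
// block lies (even partially) beyond the end of the slice.
fn page_range(blknum: u32, total_len: usize) -> Option<std::ops::Range<usize>> {
    let start = (blknum as usize).checked_mul(PAGE_SZ)?;
    let end = start.checked_add(PAGE_SZ)?;
    (end <= total_len).then(|| start..end)
}

// page_range(2, 4 * PAGE_SZ) == Some(16384..24576); page_range(4, 4 * PAGE_SZ) == None.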
///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///

View File

@@ -1,21 +1,13 @@
//! Implementation of append-only file data structure
//! used to keep in-memory layers spilled on disk.
use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
use bytes::BytesMut;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use camino::Utf8PathBuf;
use num_traits::Num;
use pageserver_api::shard::TenantShardId;
use tokio_epoll_uring::{BoundedBuf, Slice};
use tracing::error;
use std::io;
use std::sync::atomic::AtomicU64;
@@ -24,17 +16,13 @@ use utils::id::TimelineId;
pub struct EphemeralFile {
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
page_cache_file_id: page_cache::FileId,
bytes_written: u64,
buffered_writer: owned_buffers_io::write::BufferedWriter<
BytesMut,
size_tracking_writer::Writer<VirtualFile>,
>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
_gate_guard: utils::sync::gate::GateGuard,
rw: page_caching::RW,
}
const TAIL_SZ: usize = 64 * 1024;
mod page_caching;
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
mod zero_padded_read_write;
impl EphemeralFile {
pub async fn create(
@@ -64,178 +52,62 @@ impl EphemeralFile {
)
.await?;
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
let prewarm = conf.l0_flush.prewarm_on_write();
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
page_cache_file_id,
bytes_written: 0,
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
size_tracking_writer::Writer::new(file),
BytesMut::with_capacity(TAIL_SZ),
),
_gate_guard: gate_guard,
rw: page_caching::RW::new(file, prewarm, gate_guard),
})
}
}
impl Drop for EphemeralFile {
fn drop(&mut self) {
// unlink the file
// we are clear to do this, because we have entered a gate
let path = &self.buffered_writer.as_inner().as_inner().path;
let res = std::fs::remove_file(path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!("could not remove ephemeral file '{path}': {e}");
}
}
}
}
impl EphemeralFile {
pub(crate) fn len(&self) -> u64 {
self.bytes_written
self.rw.bytes_written()
}
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
self.rw.page_cache_file_id()
}
/// See [`self::page_caching::RW::load_to_vec`].
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
let size = self.len().into_usize();
let vec = Vec::with_capacity(size);
let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
assert_eq!(nread, size);
let vec = slice.into_inner();
assert_eq!(vec.len(), nread);
assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
Ok(vec)
self.rw.load_to_vec(ctx).await
}
/// Returns the offset at which the first byte of the input was written, for use
/// in constructing indices over the written value.
///
/// Panics if the write is short because there's no way we can recover from that.
/// TODO: make upstack handle this as an error.
pub(crate) async fn write_raw(
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
self.rw.read_blk(blknum, ctx).await
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<u64> {
let pos = self.bytes_written;
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!(
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
srcbuf_len = srcbuf.len(),
),
)
})?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
self.rw.write_all_borrowed(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
}
// Write the payload
let nwritten = self
.buffered_writer
.write_buffered_borrowed(srcbuf, ctx)
.await?;
assert_eq!(
nwritten,
srcbuf.len(),
"buffered writer has no short writes"
);
self.bytes_written = new_bytes_written;
self.rw.write_all_borrowed(srcbuf, ctx).await?;
Ok(pos)
}
}
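The length header written by `write_blob` above mirrors the blob format from blob_io shown earlier; a hypothetical decoder for it, included only to make the layout concrete:
// Decode a blob length header, returning (payload length, header size in bytes).
// A clear high bit means the one-byte form; a set high bit means the four-byte
// form, whose top nibble is reserved for flags and is masked off here.
fn decode_len_header(bytes: &[u8]) -> (usize, usize) {
    if bytes[0] & 0x80 == 0 {
        (bytes[0] as usize, 1)
    } else {
        let mut buf = [0u8; 4];
        buf.copy_from_slice(&bytes[..4]);
        buf[0] &= 0x0f;
        (u32::from_be_bytes(buf) as usize, 4)
    }
}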
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
&'b self,
start: u64,
dst: tokio_epoll_uring::Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
let file_size_tracking_writer = self.buffered_writer.as_inner();
let flushed_offset = file_size_tracking_writer.bytes_written();
let buffer = self.buffered_writer.inspect_buffer();
let buffered = &buffer[0..buffer.pending()];
let dst_cap = dst.bytes_total().into_u64();
let end = {
// saturating_add is correct here because the max file size is u64::MAX, so,
// if start + dst.len() > u64::MAX, then we know it will be a short read
let mut end: u64 = start.saturating_add(dst_cap);
if end > self.bytes_written {
end = self.bytes_written;
}
end
};
// inclusive, exclusive
#[derive(Debug)]
struct Range<N>(N, N);
impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
fn len(&self) -> N {
if self.0 > self.1 {
N::zero()
} else {
self.1 - self.0
}
}
}
let written_range = Range(start, std::cmp::min(end, flushed_offset));
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
let dst = if written_range.len() > 0 {
let file: &VirtualFile = file_size_tracking_writer.as_inner();
let bounds = dst.bounds();
let slice = file
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
.await?;
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
} else {
dst
};
let dst = if buffered_range.len() > 0 {
let offset_in_buffer = buffered_range
.0
.checked_sub(flushed_offset)
.unwrap()
.into_usize();
let to_copy =
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
let bounds = dst.bounds();
let mut view = dst.slice({
let start = written_range.len().into_usize();
let end = start
.checked_add(buffered_range.len().into_usize())
.unwrap();
start..end
});
view.as_mut_rust_slice_full_zeroed()
.copy_from_slice(to_copy);
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
} else {
dst
};
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
Ok((dst, (end - start).into_usize()))
}
}
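The read path above splits one logical read into a read of the already-flushed prefix (from the file) and a copy out of the writer's in-memory tail. A length-only sketch of that split (the function and its argument names are illustrative, not the crate's types):
// Split the logical read [start, start + len), clamped to `written`, into the
// part already flushed to the file and the part still in the write buffer.
fn split_read(
    start: u64,
    len: u64,
    flushed: u64,
    written: u64,
) -> (std::ops::Range<u64>, std::ops::Range<u64>) {
    let end = start.saturating_add(len).min(written);
    let file_end = end.min(flushed).max(start);
    let buf_start = start.max(flushed).min(end);
    (start..file_end, buf_start..end)
}

// With 64 KiB flushed and 32 KiB buffered: split_read(65526, 20, 65536, 98304)
// == (65526..65536, 65536..65546), i.e. 10 bytes from the file, 10 from the buffer.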
/// Does the given filename look like an ephemeral file?
pub fn is_ephemeral_file(filename: &str) -> bool {
if let Some(rest) = filename.strip_prefix("ephemeral-") {
@@ -245,13 +117,19 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
}
}
#[cfg(test)]
mod tests {
use rand::Rng;
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;
@@ -282,6 +160,69 @@ mod tests {
Ok((conf, tenant_shard_id, timeline_id, ctx))
}
#[tokio::test]
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
);
let pos_bar = file.write_blob(b"bar", &ctx).await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
);
assert_eq!(
b"bar",
file.block_cursor()
.read_blob(pos_bar, &ctx)
.await?
.as_slice()
);
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data, &ctx).await?;
blobs.push((pos, data));
}
// also test with larger blobs
for i in 0..100 {
let data = format!("blob{}", i).as_bytes().repeat(100);
let pos = file.write_blob(&data, &ctx).await?;
blobs.push((pos, data));
}
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos, &ctx).await?;
assert_eq!(actual, expected);
}
// Test a large blob that spans multiple pages
let mut large_data = vec![0; 20000];
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data, &ctx).await?;
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
assert_eq!(result, large_data);
Ok(())
}
#[tokio::test]
async fn ephemeral_file_holds_gate_open() {
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
@@ -315,151 +256,4 @@ mod tests {
.expect("closing completes right away")
.expect("closing does not panic");
}
#[tokio::test]
async fn test_ephemeral_file_basics() {
let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
let gate = utils::sync::gate::Gate::default();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let write_nbytes = cap + cap / 2;
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(write_nbytes)
.collect();
let mut value_offsets = Vec::new();
for i in 0..write_nbytes {
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
value_offsets.push(off);
}
assert!(file.len() as usize == write_nbytes);
for i in 0..write_nbytes {
assert_eq!(value_offsets[i], i.into_u64());
let buf = Vec::with_capacity(1);
let (buf_slice, nread) = file
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
.await
.unwrap();
let buf = buf_slice.into_inner();
assert_eq!(nread, 1);
assert_eq!(&buf, &content[i..i + 1]);
}
let file_contents =
std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
assert_eq!(file_contents, &content[0..cap]);
let buffer_contents = file.buffered_writer.inspect_buffer();
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
}
#[tokio::test]
async fn test_flushes_do_happen() {
let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
let gate = utils::sync::gate::Gate::default();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap + cap / 2)
.collect();
file.write_raw(&content, &ctx).await.unwrap();
// assert the state is as this test expects it to be
assert_eq!(
&file.load_to_vec(&ctx).await.unwrap(),
&content[0..cap + cap / 2]
);
let md = file
.buffered_writer
.as_inner()
.as_inner()
.path
.metadata()
.unwrap();
assert_eq!(
md.len(),
cap.into_u64(),
"buffered writer does one write if we write 1.5x buffer capacity"
);
assert_eq!(
&file.buffered_writer.inspect_buffer()[0..cap / 2],
&content[cap..cap + cap / 2]
);
}
#[tokio::test]
async fn test_read_split_across_file_and_buffer() {
// This test exercises the logic on the read path that splits the logical read
// into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
//
// This test builds on the assertions in test_flushes_do_happen
let (conf, tenant_id, timeline_id, ctx) =
harness("test_read_split_across_file_and_buffer").unwrap();
let gate = utils::sync::gate::Gate::default();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap + cap / 2)
.collect();
file.write_raw(&content, &ctx).await.unwrap();
let test_read = |start: usize, len: usize| {
let file = &file;
let ctx = &ctx;
let content = &content;
async move {
let (buf, nread) = file
.read_exact_at_eof_ok(
start.into_u64(),
Vec::with_capacity(len).slice_full(),
ctx,
)
.await
.unwrap();
assert_eq!(nread, len);
assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
}
};
// completely within the file range
assert!(20 < cap, "test assumption");
test_read(10, 10).await;
// end exactly at the edge of the flushed file
test_read(cap - 10, 10).await;
// read across file and buffer
test_read(cap - 10, 20).await;
// start exactly at the beginning of the buffer
test_read(cap, 10).await;
// completely within buffer
test_read(cap + 10, 10).await;
}
}

Some files were not shown because too many files have changed in this diff.