mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-13 21:30:37 +00:00
Compare commits
19 Commits
hack/fast-
...
split-prox
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
18303e4d68 | ||
|
|
3df6d368e3 | ||
|
|
b62e7c0138 | ||
|
|
a2968c6cf8 | ||
|
|
bae1288671 | ||
|
|
1254d8f56e | ||
|
|
073508493c | ||
|
|
7cb2349296 | ||
|
|
87151f9efd | ||
|
|
96fe084c57 | ||
|
|
20fdf3e19f | ||
|
|
c6b36d8171 | ||
|
|
0e8a848937 | ||
|
|
db4085fe22 | ||
|
|
0d895ba002 | ||
|
|
103f34e954 | ||
|
|
262378e561 | ||
|
|
9f38ab39c6 | ||
|
|
fa92328423 |
@@ -23,30 +23,10 @@ platforms = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[final-excludes]
|
[final-excludes]
|
||||||
workspace-members = [
|
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
# it is built primarily in separate repo neondatabase/autoscaling and thus is excluded
|
||||||
# it is built primarily in separate repo neondatabase/autoscaling and thus is excluded
|
# from depending on workspace-hack because most of the dependencies are not used.
|
||||||
# from depending on workspace-hack because most of the dependencies are not used.
|
workspace-members = ["vm_monitor"]
|
||||||
"vm_monitor",
|
|
||||||
# All of these exist in libs and are not usually built independently.
|
|
||||||
# Putting workspace hack there adds a bottleneck for cargo builds.
|
|
||||||
"compute_api",
|
|
||||||
"consumption_metrics",
|
|
||||||
"desim",
|
|
||||||
"metrics",
|
|
||||||
"pageserver_api",
|
|
||||||
"postgres_backend",
|
|
||||||
"postgres_connection",
|
|
||||||
"postgres_ffi",
|
|
||||||
"pq_proto",
|
|
||||||
"remote_storage",
|
|
||||||
"safekeeper_api",
|
|
||||||
"tenant_size_model",
|
|
||||||
"tracing-utils",
|
|
||||||
"utils",
|
|
||||||
"wal_craft",
|
|
||||||
"walproposer",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||||
# exact-versions = true
|
# exact-versions = true
|
||||||
|
|||||||
6
.github/ISSUE_TEMPLATE/config.yml
vendored
6
.github/ISSUE_TEMPLATE/config.yml
vendored
@@ -1,6 +0,0 @@
|
|||||||
|
|
||||||
blank_issues_enabled: true
|
|
||||||
contact_links:
|
|
||||||
- name: Feature request
|
|
||||||
url: https://console.neon.tech/app/projects?modal=feedback
|
|
||||||
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
|
|
||||||
1
.github/actionlint.yml
vendored
1
.github/actionlint.yml
vendored
@@ -1,6 +1,7 @@
|
|||||||
self-hosted-runner:
|
self-hosted-runner:
|
||||||
labels:
|
labels:
|
||||||
- arm64
|
- arm64
|
||||||
|
- gen3
|
||||||
- large
|
- large
|
||||||
- large-arm64
|
- large-arm64
|
||||||
- small
|
- small
|
||||||
|
|||||||
15
.github/actions/run-python-test-set/action.yml
vendored
15
.github/actions/run-python-test-set/action.yml
vendored
@@ -43,7 +43,7 @@ inputs:
|
|||||||
pg_version:
|
pg_version:
|
||||||
description: 'Postgres version to use for tests'
|
description: 'Postgres version to use for tests'
|
||||||
required: false
|
required: false
|
||||||
default: 'v16'
|
default: 'v14'
|
||||||
benchmark_durations:
|
benchmark_durations:
|
||||||
description: 'benchmark durations JSON'
|
description: 'benchmark durations JSON'
|
||||||
required: false
|
required: false
|
||||||
@@ -71,7 +71,7 @@ runs:
|
|||||||
if: inputs.build_type != 'remote'
|
if: inputs.build_type != 'remote'
|
||||||
uses: ./.github/actions/download
|
uses: ./.github/actions/download
|
||||||
with:
|
with:
|
||||||
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||||
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
||||||
prefix: latest
|
prefix: latest
|
||||||
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
||||||
@@ -83,6 +83,7 @@ runs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
- name: Cache poetry deps
|
- name: Cache poetry deps
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v4
|
||||||
@@ -169,8 +170,10 @@ runs:
|
|||||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||||
|
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||||
|
cov_prefix=()
|
||||||
else
|
else
|
||||||
cov_prefix=()
|
cov_prefix=()
|
||||||
fi
|
fi
|
||||||
@@ -211,13 +214,13 @@ runs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Upload compatibility snapshot
|
- name: Upload compatibility snapshot
|
||||||
# Note that we use `github.base_ref`, which is the target branch for a PR
|
if: github.ref_name == 'release'
|
||||||
if: github.event_name == 'pull_request' && github.base_ref == 'release'
|
|
||||||
uses: ./.github/actions/upload
|
uses: ./.github/actions/upload
|
||||||
with:
|
with:
|
||||||
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
|
||||||
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
|
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
|
||||||
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
|
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
|
||||||
|
prefix: latest
|
||||||
|
|
||||||
- name: Upload test results
|
- name: Upload test results
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
|
|||||||
36
.github/actions/set-docker-config-dir/action.yml
vendored
36
.github/actions/set-docker-config-dir/action.yml
vendored
@@ -1,36 +0,0 @@
|
|||||||
name: "Set custom docker config directory"
|
|
||||||
description: "Create a directory for docker config and set DOCKER_CONFIG"
|
|
||||||
|
|
||||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
|
||||||
runs:
|
|
||||||
using: "composite"
|
|
||||||
steps:
|
|
||||||
- name: Show warning on GitHub-hosted runners
|
|
||||||
if: runner.environment == 'github-hosted'
|
|
||||||
shell: bash -euo pipefail {0}
|
|
||||||
run: |
|
|
||||||
# Using the following environment variables to find a path to the workflow file
|
|
||||||
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
|
|
||||||
# ${GITHUB_REPOSITORY} - octocat/hello-world
|
|
||||||
# ${GITHUB_REF} - refs/heads/my_branch
|
|
||||||
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
|
|
||||||
|
|
||||||
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
|
|
||||||
filename=${filename_with_ref%"@$GITHUB_REF"}
|
|
||||||
|
|
||||||
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
|
|
||||||
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
|
|
||||||
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
|
|
||||||
echo "::warning file=${filename},title=${title}::${message}"
|
|
||||||
|
|
||||||
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
|
|
||||||
env:
|
|
||||||
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
|
|
||||||
with:
|
|
||||||
main: |
|
|
||||||
mkdir -p "${DOCKER_CONFIG}"
|
|
||||||
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
|
|
||||||
post: |
|
|
||||||
if [ -d "${DOCKER_CONFIG}" ]; then
|
|
||||||
rm -r "${DOCKER_CONFIG}"
|
|
||||||
fi
|
|
||||||
154
.github/workflows/_benchmarking_preparation.yml
vendored
154
.github/workflows/_benchmarking_preparation.yml
vendored
@@ -1,154 +0,0 @@
|
|||||||
name: Prepare benchmarking databases by restoring dumps
|
|
||||||
|
|
||||||
on:
|
|
||||||
workflow_call:
|
|
||||||
# no inputs needed
|
|
||||||
|
|
||||||
defaults:
|
|
||||||
run:
|
|
||||||
shell: bash -euxo pipefail {0}
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
setup-databases:
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
|
|
||||||
database: [ clickbench, tpch, userexample ]
|
|
||||||
|
|
||||||
env:
|
|
||||||
LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
|
|
||||||
PLATFORM: ${{ matrix.platform }}
|
|
||||||
PG_BINARIES: /tmp/neon/pg_install/v16/bin
|
|
||||||
|
|
||||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
|
||||||
container:
|
|
||||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
|
||||||
options: --init
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Set up Connection String
|
|
||||||
id: set-up-prep-connstr
|
|
||||||
run: |
|
|
||||||
case "${PLATFORM}" in
|
|
||||||
neon)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
|
||||||
;;
|
|
||||||
aws-rds-postgres)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
|
||||||
;;
|
|
||||||
aws-aurora-serverless-v2-postgres)
|
|
||||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Download Neon artifact
|
|
||||||
uses: ./.github/actions/download
|
|
||||||
with:
|
|
||||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
|
||||||
path: /tmp/neon/
|
|
||||||
prefix: latest
|
|
||||||
|
|
||||||
# We create a table with one row for each database that we want to restore, recording whether the restore is done
|
|
||||||
- name: Create benchmark_restore_status table if it does not exist
|
|
||||||
env:
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
# to avoid a race condition of multiple jobs trying to create the table at the same time,
|
|
||||||
# we use an advisory lock
|
|
||||||
run: |
|
|
||||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
|
|
||||||
SELECT pg_advisory_lock(4711);
|
|
||||||
CREATE TABLE IF NOT EXISTS benchmark_restore_status (
|
|
||||||
databasename text primary key,
|
|
||||||
restore_done boolean
|
|
||||||
);
|
|
||||||
SELECT pg_advisory_unlock(4711);
|
|
||||||
"
|
|
||||||
|
|
||||||
- name: Check if restore is already done
|
|
||||||
id: check-restore-done
|
|
||||||
env:
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
run: |
|
|
||||||
skip=false
|
|
||||||
if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
|
|
||||||
echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
|
|
||||||
skip=true
|
|
||||||
fi
|
|
||||||
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Check and create database if it does not exist
|
|
||||||
if: steps.check-restore-done.outputs.skip != 'true'
|
|
||||||
env:
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
run: |
|
|
||||||
DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
|
|
||||||
if [ "$DB_EXISTS" != "1" ]; then
|
|
||||||
echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
|
|
||||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
|
|
||||||
else
|
|
||||||
echo "Database ${{ env.DATABASE_NAME }} already exists."
|
|
||||||
fi
|
|
||||||
|
|
||||||
- name: Download dump from S3 to /tmp/dumps
|
|
||||||
if: steps.check-restore-done.outputs.skip != 'true'
|
|
||||||
env:
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
run: |
|
|
||||||
mkdir -p /tmp/dumps
|
|
||||||
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
|
|
||||||
|
|
||||||
- name: Replace database name in connection string
|
|
||||||
if: steps.check-restore-done.outputs.skip != 'true'
|
|
||||||
id: replace-dbname
|
|
||||||
env:
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
|
||||||
run: |
|
|
||||||
# Extract the part before the database name
|
|
||||||
base_connstr="${BENCHMARK_CONNSTR%/*}"
|
|
||||||
# Extract the query parameters (if any) after the database name
|
|
||||||
query_params="${BENCHMARK_CONNSTR#*\?}"
|
|
||||||
# Reconstruct the new connection string
|
|
||||||
if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
|
|
||||||
new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
|
|
||||||
else
|
|
||||||
new_connstr="${base_connstr}/${DATABASE_NAME}"
|
|
||||||
fi
|
|
||||||
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Restore dump
|
|
||||||
if: steps.check-restore-done.outputs.skip != 'true'
|
|
||||||
env:
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
|
|
||||||
# the following works only with larger computes:
|
|
||||||
# PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
|
|
||||||
# we add the || true because:
|
|
||||||
# the dumps were created with Neon and contain neon extensions that are not
|
|
||||||
# available in RDS, so we will always report an error, but we can ignore it
|
|
||||||
run: |
|
|
||||||
${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
|
|
||||||
-d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
|
|
||||||
|
|
||||||
- name: Update benchmark_restore_status table
|
|
||||||
if: steps.check-restore-done.outputs.skip != 'true'
|
|
||||||
env:
|
|
||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
|
||||||
DATABASE_NAME: ${{ matrix.database }}
|
|
||||||
run: |
|
|
||||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
|
|
||||||
INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
|
|
||||||
ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
|
|
||||||
"
|
|
||||||
27
.github/workflows/_build-and-test-locally.yml
vendored
27
.github/workflows/_build-and-test-locally.yml
vendored
@@ -70,6 +70,7 @@ jobs:
|
|||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
- name: Set pg 14 revision for caching
|
- name: Set pg 14 revision for caching
|
||||||
id: pg_v14_rev
|
id: pg_v14_rev
|
||||||
@@ -94,16 +95,11 @@ jobs:
|
|||||||
# We run tests with additional features that are turned off by default (e.g. in release builds), see
|
# We run tests with additional features that are turned off by default (e.g. in release builds), see
|
||||||
# corresponding Cargo.toml files for their descriptions.
|
# corresponding Cargo.toml files for their descriptions.
|
||||||
- name: Set env variables
|
- name: Set env variables
|
||||||
env:
|
|
||||||
ARCH: ${{ inputs.arch }}
|
|
||||||
run: |
|
run: |
|
||||||
CARGO_FEATURES="--features testing"
|
CARGO_FEATURES="--features testing"
|
||||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||||
CARGO_FLAGS="--locked"
|
CARGO_FLAGS="--locked"
|
||||||
elif [[ $BUILD_TYPE == "debug" ]]; then
|
|
||||||
cov_prefix=""
|
|
||||||
CARGO_FLAGS="--locked"
|
|
||||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||||
cov_prefix=""
|
cov_prefix=""
|
||||||
CARGO_FLAGS="--locked --release"
|
CARGO_FLAGS="--locked --release"
|
||||||
@@ -163,8 +159,6 @@ jobs:
|
|||||||
# Do install *before* running rust tests because they might recompile the
|
# Do install *before* running rust tests because they might recompile the
|
||||||
# binaries with different features/flags.
|
# binaries with different features/flags.
|
||||||
- name: Install rust binaries
|
- name: Install rust binaries
|
||||||
env:
|
|
||||||
ARCH: ${{ inputs.arch }}
|
|
||||||
run: |
|
run: |
|
||||||
# Install target binaries
|
# Install target binaries
|
||||||
mkdir -p /tmp/neon/bin/
|
mkdir -p /tmp/neon/bin/
|
||||||
@@ -179,7 +173,7 @@ jobs:
|
|||||||
done
|
done
|
||||||
|
|
||||||
# Install test executables and write list of all binaries (for code coverage)
|
# Install test executables and write list of all binaries (for code coverage)
|
||||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||||
# Keep bloated coverage data files away from the rest of the artifact
|
# Keep bloated coverage data files away from the rest of the artifact
|
||||||
mkdir -p /tmp/coverage/
|
mkdir -p /tmp/coverage/
|
||||||
|
|
||||||
@@ -214,16 +208,10 @@ jobs:
|
|||||||
export LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH
|
||||||
|
|
||||||
#nextest does not yet support running doctests
|
#nextest does not yet support running doctests
|
||||||
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||||
|
|
||||||
# run all non-pageserver tests
|
|
||||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
|
|
||||||
|
|
||||||
# run pageserver tests with different settings
|
|
||||||
for io_engine in std-fs tokio-epoll-uring ; do
|
for io_engine in std-fs tokio-epoll-uring ; do
|
||||||
for io_buffer_alignment in 0 1 512 ; do
|
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
|
||||||
done
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# Run separate tests for real S3
|
# Run separate tests for real S3
|
||||||
@@ -256,8 +244,8 @@ jobs:
|
|||||||
uses: ./.github/actions/save-coverage-data
|
uses: ./.github/actions/save-coverage-data
|
||||||
|
|
||||||
regress-tests:
|
regress-tests:
|
||||||
# Don't run regression tests on debug arm64 builds
|
# Run test on x64 only
|
||||||
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
|
if: inputs.arch == 'x64'
|
||||||
needs: [ build-neon ]
|
needs: [ build-neon ]
|
||||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
container:
|
container:
|
||||||
@@ -275,6 +263,7 @@ jobs:
|
|||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
- name: Pytest regression tests
|
- name: Pytest regression tests
|
||||||
uses: ./.github/actions/run-python-test-set
|
uses: ./.github/actions/run-python-test-set
|
||||||
|
|||||||
2
.github/workflows/actionlint.yml
vendored
2
.github/workflows/actionlint.yml
vendored
@@ -44,7 +44,7 @@ jobs:
|
|||||||
grep -ERl $PAT .github/workflows |\
|
grep -ERl $PAT .github/workflows |\
|
||||||
while read -r f
|
while read -r f
|
||||||
do
|
do
|
||||||
l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1)
|
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
|
||||||
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
|
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
|
||||||
done
|
done
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
86
.github/workflows/benchmarking.yml
vendored
86
.github/workflows/benchmarking.yml
vendored
@@ -96,7 +96,7 @@ jobs:
|
|||||||
uses: aws-actions/configure-aws-credentials@v4
|
uses: aws-actions/configure-aws-credentials@v4
|
||||||
with:
|
with:
|
||||||
aws-region: eu-central-1
|
aws-region: eu-central-1
|
||||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||||
role-duration-seconds: 18000 # 5 hours
|
role-duration-seconds: 18000 # 5 hours
|
||||||
|
|
||||||
- name: Download Neon artifact
|
- name: Download Neon artifact
|
||||||
@@ -146,7 +146,6 @@ jobs:
|
|||||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
|
||||||
@@ -155,10 +154,7 @@ jobs:
|
|||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic perf testing: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@@ -180,7 +176,7 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
|
||||||
- name: Download Neon artifact
|
- name: Download Neon artifact
|
||||||
uses: ./.github/actions/download
|
uses: ./.github/actions/download
|
||||||
with:
|
with:
|
||||||
@@ -219,23 +215,15 @@ jobs:
|
|||||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
with:
|
|
||||||
store-test-results-into-db: true
|
|
||||||
env:
|
|
||||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
|
||||||
|
|
||||||
- name: Post to a Slack channel
|
- name: Post to a Slack channel
|
||||||
if: ${{ github.event.schedule && failure() }}
|
if: ${{ github.event.schedule && failure() }}
|
||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic replication testing: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@@ -292,9 +280,8 @@ jobs:
|
|||||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
|
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
|
||||||
}'
|
}'
|
||||||
|
|
||||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
if [ "$(date +%A)" = "Saturday" ]; then
|
||||||
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
|
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
|
||||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
@@ -334,13 +321,9 @@ jobs:
|
|||||||
|
|
||||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
prepare_AWS_RDS_databases:
|
|
||||||
uses: ./.github/workflows/_benchmarking_preparation.yml
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
pgbench-compare:
|
pgbench-compare:
|
||||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||||
needs: [ generate-matrices, prepare_AWS_RDS_databases ]
|
needs: [ generate-matrices ]
|
||||||
permissions:
|
permissions:
|
||||||
contents: write
|
contents: write
|
||||||
statuses: write
|
statuses: write
|
||||||
@@ -377,7 +360,7 @@ jobs:
|
|||||||
aws-region: eu-central-1
|
aws-region: eu-central-1
|
||||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||||
role-duration-seconds: 18000 # 5 hours
|
role-duration-seconds: 18000 # 5 hours
|
||||||
|
|
||||||
- name: Download Neon artifact
|
- name: Download Neon artifact
|
||||||
uses: ./.github/actions/download
|
uses: ./.github/actions/download
|
||||||
with:
|
with:
|
||||||
@@ -472,7 +455,6 @@ jobs:
|
|||||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
|
||||||
@@ -481,10 +463,7 @@ jobs:
|
|||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@@ -558,7 +537,7 @@ jobs:
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
|
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
|
||||||
uses: aws-actions/configure-aws-credentials@v4
|
uses: aws-actions/configure-aws-credentials@v4
|
||||||
with:
|
with:
|
||||||
@@ -593,9 +572,8 @@ jobs:
|
|||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
|
||||||
@@ -604,10 +582,7 @@ jobs:
|
|||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@@ -620,7 +595,7 @@ jobs:
|
|||||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||||
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
|
needs: [ generate-matrices, pgbench-compare ]
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
@@ -628,7 +603,7 @@ jobs:
|
|||||||
|
|
||||||
env:
|
env:
|
||||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
DEFAULT_PG_VERSION: 16
|
DEFAULT_PG_VERSION: 14
|
||||||
TEST_OUTPUT: /tmp/test_output
|
TEST_OUTPUT: /tmp/test_output
|
||||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
|
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
|
||||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
|
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
|
||||||
@@ -680,7 +655,6 @@ jobs:
|
|||||||
run_in_parallel: false
|
run_in_parallel: false
|
||||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||||
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
||||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
|
||||||
env:
|
env:
|
||||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
@@ -690,7 +664,6 @@ jobs:
|
|||||||
TEST_OLAP_SCALE: 10
|
TEST_OLAP_SCALE: 10
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
|
||||||
@@ -699,10 +672,7 @@ jobs:
|
|||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@@ -714,7 +684,7 @@ jobs:
|
|||||||
#
|
#
|
||||||
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
||||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||||
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
|
needs: [ generate-matrices, clickbench-compare ]
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
@@ -722,7 +692,7 @@ jobs:
|
|||||||
|
|
||||||
env:
|
env:
|
||||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
DEFAULT_PG_VERSION: 16
|
DEFAULT_PG_VERSION: 14
|
||||||
TEST_OUTPUT: /tmp/test_output
|
TEST_OUTPUT: /tmp/test_output
|
||||||
BUILD_TYPE: remote
|
BUILD_TYPE: remote
|
||||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||||
@@ -754,7 +724,7 @@ jobs:
|
|||||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||||
;;
|
;;
|
||||||
rds-postgres)
|
rds-postgres)
|
||||||
ENV_PLATFORM=RDS_POSTGRES_TPCH
|
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||||
@@ -780,7 +750,6 @@ jobs:
|
|||||||
run_in_parallel: false
|
run_in_parallel: false
|
||||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||||
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
|
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
|
||||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
|
||||||
env:
|
env:
|
||||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
@@ -788,7 +757,6 @@ jobs:
|
|||||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
|
||||||
@@ -797,16 +765,13 @@ jobs:
|
|||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
user-examples-compare:
|
user-examples-compare:
|
||||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||||
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
|
needs: [ generate-matrices, tpch-compare ]
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
@@ -814,7 +779,7 @@ jobs:
|
|||||||
|
|
||||||
env:
|
env:
|
||||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||||
DEFAULT_PG_VERSION: 16
|
DEFAULT_PG_VERSION: 14
|
||||||
TEST_OUTPUT: /tmp/test_output
|
TEST_OUTPUT: /tmp/test_output
|
||||||
BUILD_TYPE: remote
|
BUILD_TYPE: remote
|
||||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||||
@@ -871,7 +836,6 @@ jobs:
|
|||||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||||
|
|
||||||
- name: Create Allure report
|
- name: Create Allure report
|
||||||
id: create-allure-report
|
|
||||||
if: ${{ !cancelled() }}
|
if: ${{ !cancelled() }}
|
||||||
uses: ./.github/actions/allure-report-generate
|
uses: ./.github/actions/allure-report-generate
|
||||||
|
|
||||||
@@ -880,10 +844,6 @@ jobs:
|
|||||||
uses: slackapi/slack-github-action@v1
|
uses: slackapi/slack-github-action@v1
|
||||||
with:
|
with:
|
||||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||||
slack-message: |
|
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
|
||||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
|
||||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
|
||||||
|
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|||||||
15
.github/workflows/build-build-tools-image.yml
vendored
15
.github/workflows/build-build-tools-image.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
arch: [ x64, arm64 ]
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
|
|
||||||
env:
|
env:
|
||||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||||
@@ -56,7 +56,13 @@ jobs:
|
|||||||
|
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- uses: ./.github/actions/set-docker-config-dir
|
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||||
|
# The default value is ~/.docker
|
||||||
|
- name: Set custom docker config directory
|
||||||
|
run: |
|
||||||
|
mkdir -p /tmp/.docker-custom
|
||||||
|
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
|
||||||
|
|
||||||
- uses: docker/setup-buildx-action@v3
|
- uses: docker/setup-buildx-action@v3
|
||||||
with:
|
with:
|
||||||
cache-binary: false
|
cache-binary: false
|
||||||
@@ -83,6 +89,11 @@ jobs:
|
|||||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
|
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
|
||||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||||
|
|
||||||
|
- name: Remove custom docker config directory
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -rf /tmp/.docker-custom
|
||||||
|
|
||||||
merge-images:
|
merge-images:
|
||||||
needs: [ build-image ]
|
needs: [ build-image ]
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
|
|||||||
199
.github/workflows/build_and_test.yml
vendored
199
.github/workflows/build_and_test.yml
vendored
@@ -48,7 +48,7 @@ jobs:
|
|||||||
|
|
||||||
tag:
|
tag:
|
||||||
needs: [ check-permissions ]
|
needs: [ check-permissions ]
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
outputs:
|
outputs:
|
||||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||||
@@ -90,7 +90,7 @@ jobs:
|
|||||||
|
|
||||||
check-codestyle-python:
|
check-codestyle-python:
|
||||||
needs: [ check-permissions, build-build-tools-image ]
|
needs: [ check-permissions, build-build-tools-image ]
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
credentials:
|
credentials:
|
||||||
@@ -101,6 +101,9 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
submodules: false
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
- name: Cache poetry deps
|
- name: Cache poetry deps
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v4
|
||||||
@@ -139,6 +142,7 @@ jobs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
# Disabled for now
|
# Disabled for now
|
||||||
# - name: Restore cargo deps cache
|
# - name: Restore cargo deps cache
|
||||||
@@ -198,9 +202,9 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
arch: [ x64, arm64 ]
|
arch: [ x64 ]
|
||||||
# Do not build or run tests in debug for release branches
|
# Do not build or run tests in debug for release branches
|
||||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
|
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
|
||||||
include:
|
include:
|
||||||
- build-type: release
|
- build-type: release
|
||||||
arch: arm64
|
arch: arm64
|
||||||
@@ -220,7 +224,7 @@ jobs:
|
|||||||
outputs:
|
outputs:
|
||||||
json: ${{ steps.get-benchmark-durations.outputs.json }}
|
json: ${{ steps.get-benchmark-durations.outputs.json }}
|
||||||
needs: [ check-permissions, build-build-tools-image ]
|
needs: [ check-permissions, build-build-tools-image ]
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
credentials:
|
credentials:
|
||||||
@@ -253,7 +257,7 @@ jobs:
|
|||||||
benchmarks:
|
benchmarks:
|
||||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||||
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
|
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
credentials:
|
credentials:
|
||||||
@@ -280,7 +284,6 @@ jobs:
|
|||||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
||||||
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
||||||
pg_version: v16
|
|
||||||
env:
|
env:
|
||||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||||
@@ -299,8 +302,9 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
channel-id: C060CNA47S9 # on-call-staging-storage-stream
|
||||||
slack-message: |
|
slack-message: |
|
||||||
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
|
Benchmarks failed on main: ${{ github.event.head_commit.url }}
|
||||||
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
|
|
||||||
|
Allure report: ${{ needs.create-test-report.outputs.report-url }}
|
||||||
env:
|
env:
|
||||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||||
|
|
||||||
@@ -310,7 +314,7 @@ jobs:
|
|||||||
outputs:
|
outputs:
|
||||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||||
|
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
credentials:
|
credentials:
|
||||||
@@ -357,7 +361,7 @@ jobs:
|
|||||||
|
|
||||||
coverage-report:
|
coverage-report:
|
||||||
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
|
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||||
credentials:
|
credentials:
|
||||||
@@ -471,7 +475,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
arch: [ x64, arm64 ]
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -480,7 +484,12 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- uses: ./.github/actions/set-docker-config-dir
|
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||||
|
# The default value is ~/.docker
|
||||||
|
- name: Set custom docker config directory
|
||||||
|
run: |
|
||||||
|
mkdir -p .docker-custom
|
||||||
|
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||||
- uses: docker/setup-buildx-action@v3
|
- uses: docker/setup-buildx-action@v3
|
||||||
with:
|
with:
|
||||||
cache-binary: false
|
cache-binary: false
|
||||||
@@ -499,10 +508,7 @@ jobs:
|
|||||||
- uses: docker/build-push-action@v6
|
- uses: docker/build-push-action@v6
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
# ARM-specific flags are recommended for Graviton ≥ 2, these flags are also supported by Ampere Altra (Azure)
|
|
||||||
# https://github.com/aws/aws-graviton-getting-started/blob/57dc813626d0266f1cc12ef83474745bb1f31fb4/rust.md
|
|
||||||
build-args: |
|
build-args: |
|
||||||
ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
|
|
||||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
|
||||||
@@ -515,6 +521,11 @@ jobs:
|
|||||||
tags: |
|
tags: |
|
||||||
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||||
|
|
||||||
|
- name: Remove custom docker config directory
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -rf .docker-custom
|
||||||
|
|
||||||
neon-image:
|
neon-image:
|
||||||
needs: [ neon-image-arch, tag ]
|
needs: [ neon-image-arch, tag ]
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
@@ -550,7 +561,7 @@ jobs:
|
|||||||
version: [ v14, v15, v16 ]
|
version: [ v14, v15, v16 ]
|
||||||
arch: [ x64, arm64 ]
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -559,7 +570,12 @@ jobs:
|
|||||||
submodules: true
|
submodules: true
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- uses: ./.github/actions/set-docker-config-dir
|
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||||
|
# The default value is ~/.docker
|
||||||
|
- name: Set custom docker config directory
|
||||||
|
run: |
|
||||||
|
mkdir -p .docker-custom
|
||||||
|
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||||
- uses: docker/setup-buildx-action@v3
|
- uses: docker/setup-buildx-action@v3
|
||||||
with:
|
with:
|
||||||
cache-binary: false
|
cache-binary: false
|
||||||
@@ -642,6 +658,11 @@ jobs:
|
|||||||
tags: |
|
tags: |
|
||||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
|
||||||
|
|
||||||
|
- name: Remove custom docker config directory
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -rf .docker-custom
|
||||||
|
|
||||||
compute-node-image:
|
compute-node-image:
|
||||||
needs: [ compute-node-image-arch, tag ]
|
needs: [ compute-node-image-arch, tag ]
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
@@ -695,7 +716,7 @@ jobs:
|
|||||||
|
|
||||||
vm-compute-node-image:
|
vm-compute-node-image:
|
||||||
needs: [ check-permissions, tag, compute-node-image ]
|
needs: [ check-permissions, tag, compute-node-image ]
|
||||||
runs-on: [ self-hosted, large ]
|
runs-on: [ self-hosted, gen3, large ]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -714,7 +735,13 @@ jobs:
|
|||||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
|
||||||
chmod +x vm-builder
|
chmod +x vm-builder
|
||||||
|
|
||||||
- uses: ./.github/actions/set-docker-config-dir
|
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||||
|
# The default value is ~/.docker
|
||||||
|
- name: Set custom docker config directory
|
||||||
|
run: |
|
||||||
|
mkdir -p .docker-custom
|
||||||
|
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||||
|
|
||||||
- uses: docker/login-action@v3
|
- uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
@@ -737,6 +764,11 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||||
|
|
||||||
|
- name: Remove custom docker config directory
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -rf .docker-custom
|
||||||
|
|
||||||
test-images:
|
test-images:
|
||||||
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
needs: [ check-permissions, tag, neon-image, compute-node-image ]
|
||||||
strategy:
|
strategy:
|
||||||
@@ -744,7 +776,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
arch: [ x64, arm64 ]
|
arch: [ x64, arm64 ]
|
||||||
|
|
||||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
@@ -752,7 +784,13 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- uses: ./.github/actions/set-docker-config-dir
|
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||||
|
# The default value is ~/.docker
|
||||||
|
- name: Set custom docker config directory
|
||||||
|
run: |
|
||||||
|
mkdir -p .docker-custom
|
||||||
|
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||||
|
|
||||||
- uses: docker/login-action@v3
|
- uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||||
@@ -792,6 +830,11 @@ jobs:
|
|||||||
docker compose -f ./docker-compose/docker-compose.yml logs || 0
|
docker compose -f ./docker-compose/docker-compose.yml logs || 0
|
||||||
docker compose -f ./docker-compose/docker-compose.yml down
|
docker compose -f ./docker-compose/docker-compose.yml down
|
||||||
|
|
||||||
|
- name: Remove custom docker config directory
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -rf .docker-custom
|
||||||
|
|
||||||
promote-images:
|
promote-images:
|
||||||
permissions:
|
permissions:
|
||||||
contents: read # This is required for actions/checkout
|
contents: read # This is required for actions/checkout
|
||||||
@@ -959,7 +1002,7 @@ jobs:
|
|||||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
|
||||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||||
|
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||||
steps:
|
steps:
|
||||||
- name: Fix git ownership
|
- name: Fix git ownership
|
||||||
@@ -979,6 +1022,7 @@ jobs:
|
|||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
|
submodules: false
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Trigger deploy workflow
|
- name: Trigger deploy workflow
|
||||||
@@ -986,10 +1030,10 @@ jobs:
|
|||||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||||
-f deployPgSniRouter=false \
|
-f deployPgSniRouter=false \
|
||||||
-f deployProxy=false \
|
-f deployProxy=false \
|
||||||
-f deployStorage=true \
|
-f deployStorage=true \
|
||||||
@@ -999,14 +1043,14 @@ jobs:
|
|||||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||||
-f deployPreprodRegion=true
|
-f deployPreprodRegion=true
|
||||||
|
|
||||||
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||||
-f deployStorage=true \
|
-f deployStorage=true \
|
||||||
-f deployStorageBroker=true \
|
-f deployStorageBroker=true \
|
||||||
-f deployStorageController=true \
|
-f deployStorageController=true \
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||||
-f deployPgSniRouter=true \
|
-f deployPgSniRouter=true \
|
||||||
-f deployProxy=true \
|
-f deployProxy=true \
|
||||||
-f deployStorage=false \
|
-f deployStorage=false \
|
||||||
@@ -1016,7 +1060,7 @@ jobs:
|
|||||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||||
-f deployPreprodRegion=true
|
-f deployPreprodRegion=true
|
||||||
|
|
||||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||||
-f deployPgSniRouter=true \
|
-f deployPgSniRouter=true \
|
||||||
-f deployProxy=true \
|
-f deployProxy=true \
|
||||||
-f branch=main \
|
-f branch=main \
|
||||||
@@ -1055,88 +1099,43 @@ jobs:
|
|||||||
generate_release_notes: true,
|
generate_release_notes: true,
|
||||||
})
|
})
|
||||||
|
|
||||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
|
||||||
promote-compatibility-data:
|
promote-compatibility-data:
|
||||||
needs: [ deploy ]
|
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
|
||||||
if: github.ref_name == 'release'
|
if: github.ref_name == 'release'
|
||||||
|
|
||||||
runs-on: ubuntu-22.04
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
|
container:
|
||||||
|
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||||
|
options: --init
|
||||||
steps:
|
steps:
|
||||||
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
|
- name: Promote compatibility snapshot for the release
|
||||||
id: fetch-last-release-pr-info
|
|
||||||
env:
|
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
run: |
|
|
||||||
branch_name_and_pr_number=$(gh pr list \
|
|
||||||
--repo "${GITHUB_REPOSITORY}" \
|
|
||||||
--base release \
|
|
||||||
--state merged \
|
|
||||||
--limit 10 \
|
|
||||||
--json mergeCommit,headRefName,number \
|
|
||||||
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
|
|
||||||
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
|
|
||||||
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
|
|
||||||
|
|
||||||
run_id=$(gh run list \
|
|
||||||
--repo "${GITHUB_REPOSITORY}" \
|
|
||||||
--workflow build_and_test.yml \
|
|
||||||
--branch "${branch_name}" \
|
|
||||||
--json databaseId \
|
|
||||||
--limit 1 \
|
|
||||||
--jq '.[].databaseId')
|
|
||||||
|
|
||||||
last_commit_sha=$(gh pr view "${pr_number}" \
|
|
||||||
--repo "${GITHUB_REPOSITORY}" \
|
|
||||||
--json commits \
|
|
||||||
--jq '.commits[-1].oid')
|
|
||||||
|
|
||||||
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
|
|
||||||
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
|
|
||||||
|
|
||||||
- name: Promote compatibility snapshot and Neon artifact
|
|
||||||
env:
|
env:
|
||||||
BUCKET: neon-github-public-dev
|
BUCKET: neon-github-public-dev
|
||||||
AWS_REGION: eu-central-1
|
PREFIX: artifacts/latest
|
||||||
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
|
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
|
|
||||||
run: |
|
run: |
|
||||||
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
|
# Update compatibility snapshot for the release
|
||||||
new_prefix="artifacts/latest"
|
for pg_version in v14 v15 v16; do
|
||||||
|
|
||||||
files_to_promote=()
|
|
||||||
files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
|
|
||||||
|
|
||||||
for arch in X64 ARM64; do
|
|
||||||
for build_type in debug release; do
|
for build_type in debug release; do
|
||||||
neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
|
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
|
||||||
s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
|
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
|
||||||
if [ -z "${s3_key}" ]; then
|
|
||||||
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
|
||||||
|
|
||||||
for pg_version in v14 v15 v16; do
|
|
||||||
# We run less tests for debug builds, so we don't need to promote them
|
|
||||||
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
|
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
|
|
||||||
s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
|
|
||||||
if [ -z "${s3_key}" ]; then
|
|
||||||
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
|
||||||
done
|
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
|
|
||||||
for f in "${files_to_promote[@]}"; do
|
# Update Neon artifact for the release (reuse already uploaded artifact)
|
||||||
time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
|
for build_type in debug release; do
|
||||||
|
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
|
||||||
|
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
|
||||||
|
|
||||||
|
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||||
|
if [ -z "${S3_KEY}" ]; then
|
||||||
|
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
|
||||||
done
|
done
|
||||||
|
|
||||||
pin-build-tools-image:
|
pin-build-tools-image:
|
||||||
@@ -1160,12 +1159,10 @@ jobs:
|
|||||||
# Format `needs` differently to make the list more readable.
|
# Format `needs` differently to make the list more readable.
|
||||||
# Usually we do `needs: [...]`
|
# Usually we do `needs: [...]`
|
||||||
needs:
|
needs:
|
||||||
- build-and-test-locally
|
|
||||||
- check-codestyle-python
|
- check-codestyle-python
|
||||||
- check-codestyle-rust
|
- check-codestyle-rust
|
||||||
- promote-images
|
- build-and-test-locally
|
||||||
- test-images
|
- test-images
|
||||||
- trigger-custom-extensions-build-and-wait
|
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
steps:
|
steps:
|
||||||
# The list of possible results:
|
# The list of possible results:
|
||||||
|
|||||||
54
.github/workflows/label-for-external-users.yml
vendored
54
.github/workflows/label-for-external-users.yml
vendored
@@ -1,54 +0,0 @@
|
|||||||
name: Add `external` label to issues and PRs created by external users
|
|
||||||
|
|
||||||
on:
|
|
||||||
issues:
|
|
||||||
types:
|
|
||||||
- opened
|
|
||||||
pull_request_target:
|
|
||||||
types:
|
|
||||||
- opened
|
|
||||||
|
|
||||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
|
||||||
permissions: {}
|
|
||||||
|
|
||||||
env:
|
|
||||||
LABEL: external
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
check-user:
|
|
||||||
runs-on: ubuntu-22.04
|
|
||||||
|
|
||||||
outputs:
|
|
||||||
is-member: ${{ steps.check-user.outputs.is-member }}
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
|
|
||||||
id: check-user
|
|
||||||
env:
|
|
||||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
|
||||||
run: |
|
|
||||||
if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
|
|
||||||
is_member=true
|
|
||||||
else
|
|
||||||
is_member=false
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
|
||||||
|
|
||||||
add-label:
|
|
||||||
if: needs.check-user.outputs.is-member == 'false'
|
|
||||||
needs: [ check-user ]
|
|
||||||
|
|
||||||
runs-on: ubuntu-22.04
|
|
||||||
permissions:
|
|
||||||
pull-requests: write # for `gh pr edit`
|
|
||||||
issues: write # for `gh issue edit`
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Add `${{ env.LABEL }}` label
|
|
||||||
env:
|
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
|
|
||||||
GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
|
|
||||||
run: |
|
|
||||||
gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}
|
|
||||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -56,6 +56,7 @@ jobs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
- name: Install macOS postgres dependencies
|
- name: Install macOS postgres dependencies
|
||||||
run: brew install flex bison openssl protobuf icu4c pkg-config
|
run: brew install flex bison openssl protobuf icu4c pkg-config
|
||||||
@@ -157,6 +158,7 @@ jobs:
|
|||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
submodules: true
|
submodules: true
|
||||||
|
fetch-depth: 1
|
||||||
|
|
||||||
# Some of our rust modules use FFI and need those to be checked
|
# Some of our rust modules use FFI and need those to be checked
|
||||||
- name: Get postgres headers
|
- name: Get postgres headers
|
||||||
|
|||||||
2
.github/workflows/periodic_pagebench.yml
vendored
2
.github/workflows/periodic_pagebench.yml
vendored
@@ -27,7 +27,7 @@ concurrency:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
trigger_bench_on_ec2_machine_in_eu_central_1:
|
trigger_bench_on_ec2_machine_in_eu_central_1:
|
||||||
runs-on: [ self-hosted, small ]
|
runs-on: [ self-hosted, gen3, small ]
|
||||||
container:
|
container:
|
||||||
image: neondatabase/build-tools:pinned
|
image: neondatabase/build-tools:pinned
|
||||||
credentials:
|
credentials:
|
||||||
|
|||||||
143
Cargo.lock
generated
143
Cargo.lock
generated
@@ -936,12 +936,6 @@ dependencies = [
|
|||||||
"which",
|
"which",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "bit_field"
|
|
||||||
version = "0.10.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
version = "1.3.2"
|
version = "1.3.2"
|
||||||
@@ -1214,6 +1208,7 @@ dependencies = [
|
|||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_with",
|
"serde_with",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1326,6 +1321,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_with",
|
"serde_with",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1333,6 +1329,7 @@ name = "control_plane"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-trait",
|
||||||
"camino",
|
"camino",
|
||||||
"clap",
|
"clap",
|
||||||
"comfy-table",
|
"comfy-table",
|
||||||
@@ -1673,13 +1670,14 @@ dependencies = [
|
|||||||
"smallvec",
|
"smallvec",
|
||||||
"tracing",
|
"tracing",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "diesel"
|
name = "diesel"
|
||||||
version = "2.2.3"
|
version = "2.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
|
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.4.1",
|
"bitflags 2.4.1",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
@@ -2949,6 +2947,17 @@ version = "1.3.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "leaky-bucket"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
|
||||||
|
dependencies = [
|
||||||
|
"parking_lot 0.12.1",
|
||||||
|
"tokio",
|
||||||
|
"tracing",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.150"
|
version = "0.2.150"
|
||||||
@@ -3138,6 +3147,7 @@ dependencies = [
|
|||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"rand_distr",
|
"rand_distr",
|
||||||
"twox-hash",
|
"twox-hash",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3677,7 +3687,6 @@ dependencies = [
|
|||||||
"async-compression",
|
"async-compression",
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bit_field",
|
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
"camino",
|
"camino",
|
||||||
@@ -3702,6 +3711,7 @@ dependencies = [
|
|||||||
"humantime-serde",
|
"humantime-serde",
|
||||||
"hyper 0.14.26",
|
"hyper 0.14.26",
|
||||||
"itertools 0.10.5",
|
"itertools 0.10.5",
|
||||||
|
"leaky-bucket",
|
||||||
"md5",
|
"md5",
|
||||||
"metrics",
|
"metrics",
|
||||||
"nix 0.27.1",
|
"nix 0.27.1",
|
||||||
@@ -3726,7 +3736,6 @@ dependencies = [
|
|||||||
"reqwest 0.12.4",
|
"reqwest 0.12.4",
|
||||||
"rpds",
|
"rpds",
|
||||||
"scopeguard",
|
"scopeguard",
|
||||||
"send-future",
|
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"serde_path_to_error",
|
"serde_path_to_error",
|
||||||
@@ -3782,6 +3791,7 @@ dependencies = [
|
|||||||
"strum_macros",
|
"strum_macros",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3789,6 +3799,7 @@ name = "pageserver_client"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures",
|
"futures",
|
||||||
"pageserver_api",
|
"pageserver_api",
|
||||||
@@ -4005,6 +4016,29 @@ dependencies = [
|
|||||||
"indexmap 1.9.3",
|
"indexmap 1.9.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pg_sni_router"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"clap",
|
||||||
|
"futures",
|
||||||
|
"git-version",
|
||||||
|
"itertools 0.10.5",
|
||||||
|
"pq_proto",
|
||||||
|
"proxy-core",
|
||||||
|
"proxy-sasl",
|
||||||
|
"rustls 0.22.4",
|
||||||
|
"rustls-pemfile 2.1.1",
|
||||||
|
"socket2 0.5.5",
|
||||||
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
|
"tracing",
|
||||||
|
"tracing-utils",
|
||||||
|
"utils",
|
||||||
|
"uuid",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf"
|
name = "phf"
|
||||||
version = "0.11.1"
|
version = "0.11.1"
|
||||||
@@ -4182,6 +4216,7 @@ dependencies = [
|
|||||||
"tokio-rustls 0.25.0",
|
"tokio-rustls 0.25.0",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4194,6 +4229,7 @@ dependencies = [
|
|||||||
"postgres",
|
"postgres",
|
||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
"url",
|
"url",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4216,6 +4252,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4253,6 +4290,7 @@ dependencies = [
|
|||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4398,6 +4436,34 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "proxy"
|
name = "proxy"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"aws-config",
|
||||||
|
"clap",
|
||||||
|
"futures",
|
||||||
|
"git-version",
|
||||||
|
"humantime",
|
||||||
|
"itertools 0.10.5",
|
||||||
|
"metrics",
|
||||||
|
"pq_proto",
|
||||||
|
"proxy-core",
|
||||||
|
"proxy-sasl",
|
||||||
|
"remote_storage",
|
||||||
|
"rustls 0.22.4",
|
||||||
|
"rustls-pemfile 2.1.1",
|
||||||
|
"socket2 0.5.5",
|
||||||
|
"tikv-jemallocator",
|
||||||
|
"tokio",
|
||||||
|
"tokio-util",
|
||||||
|
"tracing",
|
||||||
|
"tracing-utils",
|
||||||
|
"utils",
|
||||||
|
"uuid",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proxy-core"
|
||||||
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ahash",
|
"ahash",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
@@ -4424,7 +4490,6 @@ dependencies = [
|
|||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
"framed-websockets",
|
"framed-websockets",
|
||||||
"futures",
|
"futures",
|
||||||
"git-version",
|
|
||||||
"hashbrown 0.14.5",
|
"hashbrown 0.14.5",
|
||||||
"hashlink",
|
"hashlink",
|
||||||
"hex",
|
"hex",
|
||||||
@@ -4447,7 +4512,6 @@ dependencies = [
|
|||||||
"measured",
|
"measured",
|
||||||
"metrics",
|
"metrics",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"opentelemetry",
|
|
||||||
"p256 0.13.2",
|
"p256 0.13.2",
|
||||||
"parking_lot 0.12.1",
|
"parking_lot 0.12.1",
|
||||||
"parquet",
|
"parquet",
|
||||||
@@ -4457,7 +4521,7 @@ dependencies = [
|
|||||||
"postgres-protocol",
|
"postgres-protocol",
|
||||||
"postgres_backend",
|
"postgres_backend",
|
||||||
"pq_proto",
|
"pq_proto",
|
||||||
"prometheus",
|
"proxy-sasl",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"rand_distr",
|
"rand_distr",
|
||||||
"rcgen",
|
"rcgen",
|
||||||
@@ -4487,7 +4551,6 @@ dependencies = [
|
|||||||
"task-local-extensions",
|
"task-local-extensions",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tikv-jemalloc-ctl",
|
"tikv-jemalloc-ctl",
|
||||||
"tikv-jemallocator",
|
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-postgres",
|
"tokio-postgres",
|
||||||
"tokio-postgres-rustls",
|
"tokio-postgres-rustls",
|
||||||
@@ -4510,6 +4573,35 @@ dependencies = [
|
|||||||
"x509-parser",
|
"x509-parser",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proxy-sasl"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"anyhow",
|
||||||
|
"base64 0.13.1",
|
||||||
|
"bytes",
|
||||||
|
"crossbeam-deque",
|
||||||
|
"hmac",
|
||||||
|
"itertools 0.10.5",
|
||||||
|
"lasso",
|
||||||
|
"measured",
|
||||||
|
"parking_lot 0.12.1",
|
||||||
|
"pbkdf2",
|
||||||
|
"postgres-protocol",
|
||||||
|
"pq_proto",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"rustls 0.22.4",
|
||||||
|
"sha2",
|
||||||
|
"subtle",
|
||||||
|
"thiserror",
|
||||||
|
"tokio",
|
||||||
|
"tracing",
|
||||||
|
"uuid",
|
||||||
|
"workspace_hack",
|
||||||
|
"x509-parser",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quick-xml"
|
name = "quick-xml"
|
||||||
version = "0.31.0"
|
version = "0.31.0"
|
||||||
@@ -4817,6 +4909,7 @@ dependencies = [
|
|||||||
"toml_edit 0.19.10",
|
"toml_edit 0.19.10",
|
||||||
"tracing",
|
"tracing",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5341,6 +5434,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_with",
|
"serde_with",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5449,12 +5543,6 @@ version = "1.0.17"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "send-future"
|
|
||||||
version = "0.1.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sentry"
|
name = "sentry"
|
||||||
version = "0.32.3"
|
version = "0.32.3"
|
||||||
@@ -5590,12 +5678,11 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_json"
|
name = "serde_json"
|
||||||
version = "1.0.125"
|
version = "1.0.96"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
|
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"itoa",
|
"itoa",
|
||||||
"memchr",
|
|
||||||
"ryu",
|
"ryu",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
@@ -5950,6 +6037,7 @@ name = "storage_controller_client"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
"futures",
|
"futures",
|
||||||
"pageserver_api",
|
"pageserver_api",
|
||||||
@@ -6182,6 +6270,7 @@ dependencies = [
|
|||||||
"anyhow",
|
"anyhow",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6782,6 +6871,7 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
"tracing-opentelemetry",
|
"tracing-opentelemetry",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -6952,6 +7042,7 @@ dependencies = [
|
|||||||
"anyhow",
|
"anyhow",
|
||||||
"arc-swap",
|
"arc-swap",
|
||||||
"async-compression",
|
"async-compression",
|
||||||
|
"async-trait",
|
||||||
"bincode",
|
"bincode",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -6967,6 +7058,7 @@ dependencies = [
|
|||||||
"humantime",
|
"humantime",
|
||||||
"hyper 0.14.26",
|
"hyper 0.14.26",
|
||||||
"jsonwebtoken",
|
"jsonwebtoken",
|
||||||
|
"leaky-bucket",
|
||||||
"metrics",
|
"metrics",
|
||||||
"nix 0.27.1",
|
"nix 0.27.1",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@@ -6997,6 +7089,7 @@ dependencies = [
|
|||||||
"url",
|
"url",
|
||||||
"uuid",
|
"uuid",
|
||||||
"walkdir",
|
"walkdir",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -7075,6 +7168,7 @@ dependencies = [
|
|||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"regex",
|
"regex",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -7095,6 +7189,7 @@ dependencies = [
|
|||||||
"bindgen",
|
"bindgen",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"utils",
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -7651,6 +7746,8 @@ dependencies = [
|
|||||||
"tokio",
|
"tokio",
|
||||||
"tokio-rustls 0.24.0",
|
"tokio-rustls 0.24.0",
|
||||||
"tokio-util",
|
"tokio-util",
|
||||||
|
"toml_datetime",
|
||||||
|
"toml_edit 0.19.10",
|
||||||
"tonic",
|
"tonic",
|
||||||
"tower",
|
"tower",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
|||||||
10
Cargo.toml
10
Cargo.toml
@@ -9,7 +9,10 @@ members = [
|
|||||||
"pageserver/ctl",
|
"pageserver/ctl",
|
||||||
"pageserver/client",
|
"pageserver/client",
|
||||||
"pageserver/pagebench",
|
"pageserver/pagebench",
|
||||||
"proxy",
|
"proxy/core",
|
||||||
|
"proxy/sasl",
|
||||||
|
"proxy/proxy",
|
||||||
|
"proxy/pg_sni_router",
|
||||||
"safekeeper",
|
"safekeeper",
|
||||||
"storage_broker",
|
"storage_broker",
|
||||||
"storage_controller",
|
"storage_controller",
|
||||||
@@ -65,7 +68,6 @@ axum = { version = "0.6.20", features = ["ws"] }
|
|||||||
base64 = "0.13.0"
|
base64 = "0.13.0"
|
||||||
bincode = "1.3"
|
bincode = "1.3"
|
||||||
bindgen = "0.65"
|
bindgen = "0.65"
|
||||||
bit_field = "0.10.2"
|
|
||||||
bstr = "1.0"
|
bstr = "1.0"
|
||||||
byteorder = "1.4"
|
byteorder = "1.4"
|
||||||
bytes = "1.0"
|
bytes = "1.0"
|
||||||
@@ -108,12 +110,13 @@ ipnet = "2.9.0"
|
|||||||
itertools = "0.10"
|
itertools = "0.10"
|
||||||
jsonwebtoken = "9"
|
jsonwebtoken = "9"
|
||||||
lasso = "0.7"
|
lasso = "0.7"
|
||||||
|
leaky-bucket = "1.0.1"
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
measured = { version = "0.0.22", features=["lasso"] }
|
measured = { version = "0.0.22", features=["lasso"] }
|
||||||
measured-process = { version = "0.0.22" }
|
measured-process = { version = "0.0.22" }
|
||||||
memoffset = "0.8"
|
memoffset = "0.8"
|
||||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||||
notify = "6.0.0"
|
notify = "6.0.0"
|
||||||
num_cpus = "1.15"
|
num_cpus = "1.15"
|
||||||
num-traits = "0.2.15"
|
num-traits = "0.2.15"
|
||||||
@@ -145,7 +148,6 @@ rustls-split = "0.3"
|
|||||||
scopeguard = "1.1"
|
scopeguard = "1.1"
|
||||||
sysinfo = "0.29.2"
|
sysinfo = "0.29.2"
|
||||||
sd-notify = "0.4.1"
|
sd-notify = "0.4.1"
|
||||||
send-future = "0.1.0"
|
|
||||||
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||||
serde = { version = "1.0", features = ["derive"] }
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
|
|||||||
@@ -35,9 +35,8 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
|
|||||||
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||||
COPY --chown=nonroot . .
|
COPY --chown=nonroot . .
|
||||||
|
|
||||||
ARG ADDITIONAL_RUSTFLAGS
|
|
||||||
RUN set -e \
|
RUN set -e \
|
||||||
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
|
&& PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||||
--bin pg_sni_router \
|
--bin pg_sni_router \
|
||||||
--bin pageserver \
|
--bin pageserver \
|
||||||
--bin pagectl \
|
--bin pagectl \
|
||||||
|
|||||||
@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
|||||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||||
COPY patches/pg_hint_plan.patch /ext-src
|
COPY patches/pg_hintplan.patch /ext-src
|
||||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||||
COPY patches/pg_cron.patch /ext-src
|
COPY patches/pg_cron.patch /ext-src
|
||||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||||
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
|||||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||||
# cmake is required for the h3 test
|
# cmake is required for the h3 test
|
||||||
RUN apt-get update && apt-get install -y cmake
|
RUN apt-get update && apt-get install -y cmake
|
||||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
|
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||||
RUN patch -p1 </ext-src/pg_anon.patch
|
RUN patch -p1 </ext-src/pg_anon.patch
|
||||||
RUN patch -p1 </ext-src/pg_cron.patch
|
RUN patch -p1 </ext-src/pg_cron.patch
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
|||||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||||
|
|
||||||
To run the integration tests or Python scripts (not required to use the code), install
|
To run the integration tests or Python scripts (not required to use the code), install
|
||||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
|
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||||
|
|
||||||
|
|
||||||
#### Running neon database
|
#### Running neon database
|
||||||
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
|||||||
testing locally, it is convenient to run just one set of permutations, like this:
|
testing locally, it is convenient to run just one set of permutations, like this:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||||
```
|
```
|
||||||
|
|
||||||
## Flamegraphs
|
## Flamegraphs
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ use std::{thread, time::Duration};
|
|||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use chrono::Utc;
|
use chrono::Utc;
|
||||||
use clap::Arg;
|
use clap::Arg;
|
||||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
|
||||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||||
use tracing::{error, info, warn};
|
use tracing::{error, info, warn};
|
||||||
@@ -367,8 +366,6 @@ fn wait_spec(
|
|||||||
state.start_time = now;
|
state.start_time = now;
|
||||||
}
|
}
|
||||||
|
|
||||||
launch_lsn_lease_bg_task_for_static(&compute);
|
|
||||||
|
|
||||||
Ok(WaitSpecResult {
|
Ok(WaitSpecResult {
|
||||||
compute,
|
compute,
|
||||||
http_port,
|
http_port,
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ pub mod logger;
|
|||||||
pub mod catalog;
|
pub mod catalog;
|
||||||
pub mod compute;
|
pub mod compute;
|
||||||
pub mod extension_server;
|
pub mod extension_server;
|
||||||
pub mod lsn_lease;
|
|
||||||
mod migration;
|
mod migration;
|
||||||
pub mod monitor;
|
pub mod monitor;
|
||||||
pub mod params;
|
pub mod params;
|
||||||
|
|||||||
@@ -1,186 +0,0 @@
|
|||||||
use anyhow::bail;
|
|
||||||
use anyhow::Result;
|
|
||||||
use postgres::{NoTls, SimpleQueryMessage};
|
|
||||||
use std::time::SystemTime;
|
|
||||||
use std::{str::FromStr, sync::Arc, thread, time::Duration};
|
|
||||||
use utils::id::TenantId;
|
|
||||||
use utils::id::TimelineId;
|
|
||||||
|
|
||||||
use compute_api::spec::ComputeMode;
|
|
||||||
use tracing::{info, warn};
|
|
||||||
use utils::{
|
|
||||||
lsn::Lsn,
|
|
||||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
|
||||||
};
|
|
||||||
|
|
||||||
use crate::compute::ComputeNode;
|
|
||||||
|
|
||||||
/// Spawns a background thread to periodically renew LSN leases for static compute.
|
|
||||||
/// Do nothing if the compute is not in static mode.
|
|
||||||
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
|
|
||||||
let (tenant_id, timeline_id, lsn) = {
|
|
||||||
let state = compute.state.lock().unwrap();
|
|
||||||
let spec = state.pspec.as_ref().expect("Spec must be set");
|
|
||||||
match spec.spec.mode {
|
|
||||||
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
|
|
||||||
_ => return,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let compute = compute.clone();
|
|
||||||
|
|
||||||
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
|
||||||
thread::spawn(move || {
|
|
||||||
let _entered = span.entered();
|
|
||||||
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
|
||||||
// TODO: might need stronger error feedback than logging an warning.
|
|
||||||
warn!("Exited with error: {e}");
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Renews lsn lease periodically so static compute are not affected by GC.
|
|
||||||
fn lsn_lease_bg_task(
|
|
||||||
compute: Arc<ComputeNode>,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
lsn: Lsn,
|
|
||||||
) -> Result<()> {
|
|
||||||
loop {
|
|
||||||
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
|
|
||||||
let valid_duration = valid_until
|
|
||||||
.duration_since(SystemTime::now())
|
|
||||||
.unwrap_or(Duration::ZERO);
|
|
||||||
|
|
||||||
// Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
|
|
||||||
let sleep_duration = valid_duration
|
|
||||||
.saturating_sub(Duration::from_secs(60))
|
|
||||||
.max(valid_duration / 2);
|
|
||||||
|
|
||||||
info!(
|
|
||||||
"Succeeded, sleeping for {} seconds",
|
|
||||||
sleep_duration.as_secs()
|
|
||||||
);
|
|
||||||
thread::sleep(sleep_duration);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
|
|
||||||
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
|
|
||||||
fn acquire_lsn_lease_with_retry(
|
|
||||||
compute: &Arc<ComputeNode>,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
lsn: Lsn,
|
|
||||||
) -> Result<SystemTime> {
|
|
||||||
let mut attempts = 0usize;
|
|
||||||
let mut retry_period_ms: f64 = 500.0;
|
|
||||||
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
|
|
||||||
|
|
||||||
loop {
|
|
||||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
|
||||||
let configs = {
|
|
||||||
let state = compute.state.lock().unwrap();
|
|
||||||
|
|
||||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
|
||||||
|
|
||||||
let conn_strings = spec.pageserver_connstr.split(',');
|
|
||||||
|
|
||||||
conn_strings
|
|
||||||
.map(|connstr| {
|
|
||||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
|
||||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
|
||||||
info!("Got storage auth token from spec file");
|
|
||||||
config.password(storage_auth_token.clone());
|
|
||||||
} else {
|
|
||||||
info!("Storage auth token not set");
|
|
||||||
}
|
|
||||||
config
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
};
|
|
||||||
|
|
||||||
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
|
|
||||||
match result {
|
|
||||||
Ok(Some(res)) => {
|
|
||||||
return Ok(res);
|
|
||||||
}
|
|
||||||
Ok(None) => {
|
|
||||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
|
||||||
|
|
||||||
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
|
||||||
retry_period_ms *= 1.5;
|
|
||||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
attempts += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Tries to acquire an LSN lease through PS page_service API.
|
|
||||||
fn try_acquire_lsn_lease(
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
lsn: Lsn,
|
|
||||||
configs: &[postgres::Config],
|
|
||||||
) -> Result<Option<SystemTime>> {
|
|
||||||
fn get_valid_until(
|
|
||||||
config: &postgres::Config,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
lsn: Lsn,
|
|
||||||
) -> Result<Option<SystemTime>> {
|
|
||||||
let mut client = config.connect(NoTls)?;
|
|
||||||
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
|
|
||||||
let res = client.simple_query(&cmd)?;
|
|
||||||
let msg = match res.first() {
|
|
||||||
Some(msg) => msg,
|
|
||||||
None => bail!("empty response"),
|
|
||||||
};
|
|
||||||
let row = match msg {
|
|
||||||
SimpleQueryMessage::Row(row) => row,
|
|
||||||
_ => bail!("error parsing lsn lease response"),
|
|
||||||
};
|
|
||||||
|
|
||||||
// Note: this will be None if a lease is explicitly not granted.
|
|
||||||
let valid_until_str = row.get("valid_until");
|
|
||||||
|
|
||||||
let valid_until = valid_until_str.map(|s| {
|
|
||||||
SystemTime::UNIX_EPOCH
|
|
||||||
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
|
||||||
.expect("Time larger than max SystemTime could handle")
|
|
||||||
});
|
|
||||||
Ok(valid_until)
|
|
||||||
}
|
|
||||||
|
|
||||||
let shard_count = configs.len();
|
|
||||||
|
|
||||||
let valid_until = if shard_count > 1 {
|
|
||||||
configs
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(shard_number, config)| {
|
|
||||||
let tenant_shard_id = TenantShardId {
|
|
||||||
tenant_id,
|
|
||||||
shard_count: ShardCount::new(shard_count as u8),
|
|
||||||
shard_number: ShardNumber(shard_number as u8),
|
|
||||||
};
|
|
||||||
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
|
|
||||||
})
|
|
||||||
.collect::<Result<Vec<Option<SystemTime>>>>()?
|
|
||||||
.into_iter()
|
|
||||||
.min()
|
|
||||||
.unwrap()
|
|
||||||
} else {
|
|
||||||
get_valid_until(
|
|
||||||
&configs[0],
|
|
||||||
TenantShardId::unsharded(tenant_id),
|
|
||||||
timeline_id,
|
|
||||||
lsn,
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(valid_until)
|
|
||||||
}
|
|
||||||
@@ -6,6 +6,7 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
|
async-trait.workspace = true
|
||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
comfy-table.workspace = true
|
comfy-table.workspace = true
|
||||||
|
|||||||
@@ -379,7 +379,7 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||||
match kill(pid, None) {
|
match kill(pid, None) {
|
||||||
// Process exists, keep waiting
|
// Process exists, keep waiting
|
||||||
Ok(_) => Ok(false),
|
Ok(_) => Ok(false),
|
||||||
|
|||||||
@@ -15,9 +15,7 @@ use control_plane::local_env::{
|
|||||||
};
|
};
|
||||||
use control_plane::pageserver::PageServerNode;
|
use control_plane::pageserver::PageServerNode;
|
||||||
use control_plane::safekeeper::SafekeeperNode;
|
use control_plane::safekeeper::SafekeeperNode;
|
||||||
use control_plane::storage_controller::{
|
use control_plane::storage_controller::StorageController;
|
||||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
|
||||||
};
|
|
||||||
use control_plane::{broker, local_env};
|
use control_plane::{broker, local_env};
|
||||||
use pageserver_api::config::{
|
use pageserver_api::config::{
|
||||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||||
@@ -54,7 +52,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
|||||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||||
project_git_version!(GIT_VERSION);
|
project_git_version!(GIT_VERSION);
|
||||||
|
|
||||||
const DEFAULT_PG_VERSION: &str = "16";
|
const DEFAULT_PG_VERSION: &str = "15";
|
||||||
|
|
||||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||||
|
|
||||||
@@ -1054,36 +1052,6 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
|
|||||||
humantime_duration.as_ref()
|
humantime_duration.as_ref()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
|
|
||||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
|
||||||
|
|
||||||
let base_port = args.get_one::<u16>("base-port");
|
|
||||||
|
|
||||||
if maybe_instance_id.is_some() && base_port.is_none() {
|
|
||||||
panic!("storage-controller start specificied instance-id but did not provide base-port");
|
|
||||||
}
|
|
||||||
|
|
||||||
let start_timeout = args
|
|
||||||
.get_one::<humantime::Duration>("start-timeout")
|
|
||||||
.expect("invalid value for start-timeout");
|
|
||||||
|
|
||||||
NeonStorageControllerStartArgs {
|
|
||||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
|
||||||
base_port: base_port.copied(),
|
|
||||||
start_timeout: *start_timeout,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
|
|
||||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
|
||||||
let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
|
||||||
|
|
||||||
NeonStorageControllerStopArgs {
|
|
||||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
|
||||||
immediate,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||||
match sub_match.subcommand() {
|
match sub_match.subcommand() {
|
||||||
Some(("start", subcommand_args)) => {
|
Some(("start", subcommand_args)) => {
|
||||||
@@ -1145,14 +1113,19 @@ async fn handle_storage_controller(
|
|||||||
let svc = StorageController::from_env(env);
|
let svc = StorageController::from_env(env);
|
||||||
match sub_match.subcommand() {
|
match sub_match.subcommand() {
|
||||||
Some(("start", start_match)) => {
|
Some(("start", start_match)) => {
|
||||||
if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
|
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
|
||||||
eprintln!("start failed: {e}");
|
eprintln!("start failed: {e}");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Some(("stop", stop_match)) => {
|
Some(("stop", stop_match)) => {
|
||||||
if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
|
let immediate = stop_match
|
||||||
|
.get_one::<String>("stop-mode")
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
== Some("immediate");
|
||||||
|
|
||||||
|
if let Err(e) = svc.stop(immediate).await {
|
||||||
eprintln!("stop failed: {}", e);
|
eprintln!("stop failed: {}", e);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@@ -1255,12 +1228,7 @@ async fn handle_start_all(
|
|||||||
// Only start the storage controller if the pageserver is configured to need it
|
// Only start the storage controller if the pageserver is configured to need it
|
||||||
if env.control_plane_api.is_some() {
|
if env.control_plane_api.is_some() {
|
||||||
let storage_controller = StorageController::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
if let Err(e) = storage_controller
|
if let Err(e) = storage_controller.start(retry_timeout).await {
|
||||||
.start(NeonStorageControllerStartArgs::with_default_instance_id(
|
|
||||||
(*retry_timeout).into(),
|
|
||||||
))
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
eprintln!("storage_controller start failed: {:#}", e);
|
eprintln!("storage_controller start failed: {:#}", e);
|
||||||
try_stop_all(env, true).await;
|
try_stop_all(env, true).await;
|
||||||
exit(1);
|
exit(1);
|
||||||
@@ -1390,21 +1358,10 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
|||||||
eprintln!("neon broker stop failed: {e:#}");
|
eprintln!("neon broker stop failed: {e:#}");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stop all storage controller instances. In the most common case there's only one,
|
if env.control_plane_api.is_some() {
|
||||||
// but iterate through the base data directory in order to discover the instances.
|
|
||||||
let storcon_instances = env
|
|
||||||
.storage_controller_instances()
|
|
||||||
.await
|
|
||||||
.expect("Must inspect data dir");
|
|
||||||
for (instance_id, _instance_dir_path) in storcon_instances {
|
|
||||||
let storage_controller = StorageController::from_env(env);
|
let storage_controller = StorageController::from_env(env);
|
||||||
let stop_args = NeonStorageControllerStopArgs {
|
if let Err(e) = storage_controller.stop(immediate).await {
|
||||||
instance_id,
|
eprintln!("storage controller stop failed: {e:#}");
|
||||||
immediate,
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Err(e) = storage_controller.stop(stop_args).await {
|
|
||||||
eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1544,18 +1501,6 @@ fn cli() -> Command {
|
|||||||
.action(ArgAction::SetTrue)
|
.action(ArgAction::SetTrue)
|
||||||
.required(false);
|
.required(false);
|
||||||
|
|
||||||
let instance_id = Arg::new("instance-id")
|
|
||||||
.long("instance-id")
|
|
||||||
.help("Identifier used to distinguish storage controller instances (default 1)")
|
|
||||||
.value_parser(value_parser!(u8))
|
|
||||||
.required(false);
|
|
||||||
|
|
||||||
let base_port = Arg::new("base-port")
|
|
||||||
.long("base-port")
|
|
||||||
.help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
|
|
||||||
.value_parser(value_parser!(u16))
|
|
||||||
.required(false);
|
|
||||||
|
|
||||||
Command::new("Neon CLI")
|
Command::new("Neon CLI")
|
||||||
.arg_required_else_help(true)
|
.arg_required_else_help(true)
|
||||||
.version(GIT_VERSION)
|
.version(GIT_VERSION)
|
||||||
@@ -1664,12 +1609,9 @@ fn cli() -> Command {
|
|||||||
.arg_required_else_help(true)
|
.arg_required_else_help(true)
|
||||||
.about("Manage storage_controller")
|
.about("Manage storage_controller")
|
||||||
.subcommand(Command::new("start").about("Start storage controller")
|
.subcommand(Command::new("start").about("Start storage controller")
|
||||||
.arg(timeout_arg.clone())
|
.arg(timeout_arg.clone()))
|
||||||
.arg(instance_id.clone())
|
|
||||||
.arg(base_port))
|
|
||||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||||
.arg(stop_mode_arg.clone())
|
.arg(stop_mode_arg.clone()))
|
||||||
.arg(instance_id))
|
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("safekeeper")
|
Command::new("safekeeper")
|
||||||
|
|||||||
@@ -824,12 +824,11 @@ impl Endpoint {
|
|||||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||||
// etc.
|
// etc.
|
||||||
//
|
//
|
||||||
// If destroying or stop mode is immediate, send it SIGTERM before
|
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
|
||||||
// waiting. Sometimes we do *not* want this cleanup: tests intentionally
|
// want this cleanup: tests intentionally do stop when majority of
|
||||||
// do stop when majority of safekeepers is down, so sync-safekeepers
|
// safekeepers is down, so sync-safekeepers would hang otherwise. This
|
||||||
// would hang otherwise. This could be a separate flag though.
|
// could be a separate flag though.
|
||||||
let send_sigterm = destroy || mode == "immediate";
|
self.wait_for_compute_ctl_to_exit(destroy)?;
|
||||||
self.wait_for_compute_ctl_to_exit(send_sigterm)?;
|
|
||||||
if destroy {
|
if destroy {
|
||||||
println!(
|
println!(
|
||||||
"Destroying postgres data directory '{}'",
|
"Destroying postgres data directory '{}'",
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
|
|||||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||||
use crate::safekeeper::SafekeeperNode;
|
use crate::safekeeper::SafekeeperNode;
|
||||||
|
|
||||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||||
|
|
||||||
//
|
//
|
||||||
// This data structure represents neon_local CLI config
|
// This data structure represents neon_local CLI config
|
||||||
@@ -156,11 +156,6 @@ pub struct NeonStorageControllerConf {
|
|||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
pub max_warming_up: Duration,
|
pub max_warming_up: Duration,
|
||||||
|
|
||||||
pub start_as_candidate: bool,
|
|
||||||
|
|
||||||
/// Database url used when running multiple storage controller instances
|
|
||||||
pub database_url: Option<SocketAddr>,
|
|
||||||
|
|
||||||
/// Threshold for auto-splitting a tenant into shards
|
/// Threshold for auto-splitting a tenant into shards
|
||||||
pub split_threshold: Option<u64>,
|
pub split_threshold: Option<u64>,
|
||||||
|
|
||||||
@@ -179,8 +174,6 @@ impl Default for NeonStorageControllerConf {
|
|||||||
Self {
|
Self {
|
||||||
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
||||||
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
||||||
start_as_candidate: false,
|
|
||||||
database_url: None,
|
|
||||||
split_threshold: None,
|
split_threshold: None,
|
||||||
max_secondary_lag_bytes: None,
|
max_secondary_lag_bytes: None,
|
||||||
}
|
}
|
||||||
@@ -399,36 +392,6 @@ impl LocalEnv {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
|
||||||
/// for all storage controller instances
|
|
||||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
|
||||||
let mut instances = Vec::default();
|
|
||||||
|
|
||||||
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
|
|
||||||
for dentry in dir {
|
|
||||||
let dentry = dentry?;
|
|
||||||
let is_dir = dentry.metadata()?.is_dir();
|
|
||||||
let filename = dentry.file_name().into_string().unwrap();
|
|
||||||
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
|
|
||||||
Some(suffix) => suffix.parse::<u8>().ok(),
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
let is_instance_dir = is_dir && parsed_instance_id.is_some();
|
|
||||||
|
|
||||||
if !is_instance_dir {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
instances.push((
|
|
||||||
parsed_instance_id.expect("Checked previously"),
|
|
||||||
dentry.path(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(instances)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn register_branch_mapping(
|
pub fn register_branch_mapping(
|
||||||
&mut self,
|
&mut self,
|
||||||
branch_name: String,
|
branch_name: String,
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
//! ```text
|
//! ```text
|
||||||
//! .neon/safekeepers/<safekeeper id>
|
//! .neon/safekeepers/<safekeeper id>
|
||||||
//! ```
|
//! ```
|
||||||
use std::future::Future;
|
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@@ -35,10 +34,12 @@ pub enum SafekeeperHttpError {
|
|||||||
|
|
||||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||||
|
|
||||||
pub(crate) trait ResponseErrorMessageExt: Sized {
|
#[async_trait::async_trait]
|
||||||
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
|
pub trait ResponseErrorMessageExt: Sized {
|
||||||
|
async fn error_from_body(self) -> Result<Self>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
impl ResponseErrorMessageExt for reqwest::Response {
|
impl ResponseErrorMessageExt for reqwest::Response {
|
||||||
async fn error_from_body(self) -> Result<Self> {
|
async fn error_from_body(self) -> Result<Self> {
|
||||||
let status = self.status();
|
let status = self.status();
|
||||||
|
|||||||
@@ -3,8 +3,6 @@ use crate::{
|
|||||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||||
};
|
};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use hyper::Uri;
|
|
||||||
use nix::unistd::Pid;
|
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
controller_api::{
|
controller_api::{
|
||||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||||
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
|||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
use reqwest::Method;
|
use reqwest::Method;
|
||||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||||
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
|
use std::{fs, str::FromStr, time::Duration};
|
||||||
use tokio::process::Command;
|
use tokio::process::Command;
|
||||||
use tracing::instrument;
|
use tracing::instrument;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
@@ -31,14 +29,12 @@ use utils::{
|
|||||||
|
|
||||||
pub struct StorageController {
|
pub struct StorageController {
|
||||||
env: LocalEnv,
|
env: LocalEnv,
|
||||||
|
listen: String,
|
||||||
private_key: Option<Vec<u8>>,
|
private_key: Option<Vec<u8>>,
|
||||||
public_key: Option<String>,
|
public_key: Option<String>,
|
||||||
|
postgres_port: u16,
|
||||||
client: reqwest::Client,
|
client: reqwest::Client,
|
||||||
config: NeonStorageControllerConf,
|
config: NeonStorageControllerConf,
|
||||||
|
|
||||||
// The listen address is learned when starting the storage controller,
|
|
||||||
// hence the use of OnceLock to init it at the right time.
|
|
||||||
listen: OnceLock<SocketAddr>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const COMMAND: &str = "storage_controller";
|
const COMMAND: &str = "storage_controller";
|
||||||
@@ -47,36 +43,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
|||||||
|
|
||||||
const DB_NAME: &str = "storage_controller";
|
const DB_NAME: &str = "storage_controller";
|
||||||
|
|
||||||
pub struct NeonStorageControllerStartArgs {
|
|
||||||
pub instance_id: u8,
|
|
||||||
pub base_port: Option<u16>,
|
|
||||||
pub start_timeout: humantime::Duration,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl NeonStorageControllerStartArgs {
|
|
||||||
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
|
|
||||||
Self {
|
|
||||||
instance_id: 1,
|
|
||||||
base_port: None,
|
|
||||||
start_timeout,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Arguments for stopping a single storage controller instance.
pub struct NeonStorageControllerStopArgs {
    /// Identifier of the storage controller instance to stop.
    pub instance_id: u8,
    /// Whether the "immediate" stop mode was requested.
    pub immediate: bool,
}

impl NeonStorageControllerStopArgs {
    /// Stop arguments for instance 1 with the given stop mode.
    pub fn with_default_instance_id(immediate: bool) -> Self {
        let instance_id = 1;
        Self {
            instance_id,
            immediate,
        }
    }
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
pub struct AttachHookRequest {
|
pub struct AttachHookRequest {
|
||||||
pub tenant_shard_id: TenantShardId,
|
pub tenant_shard_id: TenantShardId,
|
||||||
@@ -101,6 +67,23 @@ pub struct InspectResponse {
|
|||||||
|
|
||||||
impl StorageController {
|
impl StorageController {
|
||||||
pub fn from_env(env: &LocalEnv) -> Self {
|
pub fn from_env(env: &LocalEnv) -> Self {
|
||||||
|
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||||
|
// pageservers have control plane API set
|
||||||
|
let listen_url = env.control_plane_api.clone().unwrap();
|
||||||
|
|
||||||
|
let listen = format!(
|
||||||
|
"{}:{}",
|
||||||
|
listen_url.host_str().unwrap(),
|
||||||
|
listen_url.port().unwrap()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||||
|
// port, for use by our captive postgres.
|
||||||
|
let postgres_port = listen_url
|
||||||
|
.port()
|
||||||
|
.expect("Control plane API setting should always have a port")
|
||||||
|
+ 1;
|
||||||
|
|
||||||
// Assume all pageservers have symmetric auth configuration: this service
|
// Assume all pageservers have symmetric auth configuration: this service
|
||||||
// expects to use one JWT token to talk to all of them.
|
// expects to use one JWT token to talk to all of them.
|
||||||
let ps_conf = env
|
let ps_conf = env
|
||||||
@@ -143,28 +126,20 @@ impl StorageController {
|
|||||||
|
|
||||||
Self {
|
Self {
|
||||||
env: env.clone(),
|
env: env.clone(),
|
||||||
|
listen,
|
||||||
private_key,
|
private_key,
|
||||||
public_key,
|
public_key,
|
||||||
|
postgres_port,
|
||||||
client: reqwest::ClientBuilder::new()
|
client: reqwest::ClientBuilder::new()
|
||||||
.build()
|
.build()
|
||||||
.expect("Failed to construct http client"),
|
.expect("Failed to construct http client"),
|
||||||
config: env.storage_controller.clone(),
|
config: env.storage_controller.clone(),
|
||||||
listen: OnceLock::default(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
|
fn pid_file(&self) -> Utf8PathBuf {
|
||||||
self.env
|
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||||
.base_data_dir
|
.expect("non-Unicode path")
|
||||||
.join(format!("storage_controller_{}", instance_id))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
|
|
||||||
Utf8PathBuf::from_path_buf(
|
|
||||||
self.storage_controller_instance_dir(instance_id)
|
|
||||||
.join("storage_controller.pid"),
|
|
||||||
)
|
|
||||||
.expect("non-Unicode path")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// PIDFile for the postgres instance used to store storage controller state
|
/// PIDFile for the postgres instance used to store storage controller state
|
||||||
@@ -209,23 +184,23 @@ impl StorageController {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Readiness check for our postgres process
|
/// Readiness check for our postgres process
|
||||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
|
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||||
let bin_path = pg_bin_dir.join("pg_isready");
|
let bin_path = pg_bin_dir.join("pg_isready");
|
||||||
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
|
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||||
|
|
||||||
Ok(exitcode.success())
|
Ok(exitcode.success())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create our database if it doesn't exist
|
/// Create our database if it doesn't exist, and run migrations.
|
||||||
///
|
///
|
||||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||||
///
|
///
|
||||||
/// Returns the database url
|
/// Returns the database url
|
||||||
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
|
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||||
|
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
let createdb_path = pg_bin_dir.join("createdb");
|
let createdb_path = pg_bin_dir.join("createdb");
|
||||||
@@ -234,7 +209,7 @@ impl StorageController {
|
|||||||
"-h",
|
"-h",
|
||||||
"localhost",
|
"localhost",
|
||||||
"-p",
|
"-p",
|
||||||
&format!("{}", postgres_port),
|
&format!("{}", self.postgres_port),
|
||||||
DB_NAME,
|
DB_NAME,
|
||||||
])
|
])
|
||||||
.output()
|
.output()
|
||||||
@@ -255,14 +230,13 @@ impl StorageController {
|
|||||||
|
|
||||||
pub async fn connect_to_database(
|
pub async fn connect_to_database(
|
||||||
&self,
|
&self,
|
||||||
postgres_port: u16,
|
|
||||||
) -> anyhow::Result<(
|
) -> anyhow::Result<(
|
||||||
tokio_postgres::Client,
|
tokio_postgres::Client,
|
||||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||||
)> {
|
)> {
|
||||||
tokio_postgres::Config::new()
|
tokio_postgres::Config::new()
|
||||||
.host("localhost")
|
.host("localhost")
|
||||||
.port(postgres_port)
|
.port(self.postgres_port)
|
||||||
// The user is the ambient operating system user name.
|
// The user is the ambient operating system user name.
|
||||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||||
//
|
//
|
||||||
@@ -278,114 +252,72 @@ impl StorageController {
|
|||||||
.map_err(anyhow::Error::new)
|
.map_err(anyhow::Error::new)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
|
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||||
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
|
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||||
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
|
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||||
if err.kind() != std::io::ErrorKind::AlreadyExists {
|
.unwrap()
|
||||||
panic!("Failed to create instance dir {instance_dir:?}");
|
.join("storage_controller_db");
|
||||||
}
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
}
|
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||||
|
let pg_log_path = pg_data_path.join("postgres.log");
|
||||||
|
|
||||||
let (listen, postgres_port) = {
|
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||||
if let Some(base_port) = start_args.base_port {
|
// Initialize empty database
|
||||||
(
|
let initdb_path = pg_bin_dir.join("initdb");
|
||||||
format!("127.0.0.1:{base_port}"),
|
let mut child = Command::new(&initdb_path)
|
||||||
self.config
|
.envs(vec![
|
||||||
.database_url
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
.expect("--base-port requires NeonStorageControllerConf::database_url")
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
.port(),
|
])
|
||||||
)
|
.args(["-D", pg_data_path.as_ref()])
|
||||||
} else {
|
.spawn()
|
||||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
.expect("Failed to spawn initdb");
|
||||||
|
let status = child.wait().await?;
|
||||||
let listen = format!(
|
if !status.success() {
|
||||||
"{}:{}",
|
anyhow::bail!("initdb failed with status {status}");
|
||||||
listen_url.host_str().unwrap(),
|
|
||||||
listen_url.port().unwrap()
|
|
||||||
);
|
|
||||||
|
|
||||||
(listen, listen_url.port().unwrap() + 1)
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let socket_addr = listen
|
// Write a minimal config file:
|
||||||
.parse()
|
// - Specify the port, since this is chosen dynamically
|
||||||
.expect("listen address is a valid socket address");
|
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||||
self.listen
|
// the storage controller we don't want a slow local disk to interfere with that.
|
||||||
.set(socket_addr)
|
//
|
||||||
.expect("StorageController::listen is only set here");
|
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||||
|
// from `LocalEnv`'s config file (`.neon/config`).
|
||||||
|
tokio::fs::write(
|
||||||
|
&pg_data_path.join("postgresql.conf"),
|
||||||
|
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
// Do we remove the pid file on stop?
|
println!("Starting storage controller database...");
|
||||||
let pg_started = self.is_postgres_running().await?;
|
let db_start_args = [
|
||||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
"-w",
|
||||||
|
"-D",
|
||||||
|
pg_data_path.as_ref(),
|
||||||
|
"-l",
|
||||||
|
pg_log_path.as_ref(),
|
||||||
|
"start",
|
||||||
|
];
|
||||||
|
|
||||||
if !pg_started {
|
background_process::start_process(
|
||||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
"storage_controller_db",
|
||||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
&self.env.base_data_dir,
|
||||||
.unwrap()
|
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||||
.join("storage_controller_db");
|
db_start_args,
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
vec![
|
||||||
let pg_log_path = pg_data_path.join("postgres.log");
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
|
],
|
||||||
|
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||||
|
retry_timeout,
|
||||||
|
|| self.pg_isready(&pg_bin_dir),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
// Run migrations on every startup, in case something changed.
|
||||||
// Initialize empty database
|
let database_url = self.setup_database().await?;
|
||||||
let initdb_path = pg_bin_dir.join("initdb");
|
|
||||||
let mut child = Command::new(&initdb_path)
|
|
||||||
.envs(vec![
|
|
||||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
|
||||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
|
||||||
])
|
|
||||||
.args(["-D", pg_data_path.as_ref()])
|
|
||||||
.spawn()
|
|
||||||
.expect("Failed to spawn initdb");
|
|
||||||
let status = child.wait().await?;
|
|
||||||
if !status.success() {
|
|
||||||
anyhow::bail!("initdb failed with status {status}");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Write a minimal config file:
|
|
||||||
// - Specify the port, since this is chosen dynamically
|
|
||||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
|
||||||
// the storage controller we don't want a slow local disk to interfere with that.
|
|
||||||
//
|
|
||||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
|
||||||
// from `LocalEnv`'s config file (`.neon/config`).
|
|
||||||
tokio::fs::write(
|
|
||||||
&pg_data_path.join("postgresql.conf"),
|
|
||||||
format!("port = {}\nfsync=off\n", postgres_port),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!("Starting storage controller database...");
|
|
||||||
let db_start_args = [
|
|
||||||
"-w",
|
|
||||||
"-D",
|
|
||||||
pg_data_path.as_ref(),
|
|
||||||
"-l",
|
|
||||||
pg_log_path.as_ref(),
|
|
||||||
"start",
|
|
||||||
];
|
|
||||||
|
|
||||||
background_process::start_process(
|
|
||||||
"storage_controller_db",
|
|
||||||
&self.env.base_data_dir,
|
|
||||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
|
||||||
db_start_args,
|
|
||||||
vec![
|
|
||||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
|
||||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
|
||||||
],
|
|
||||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
|
||||||
&start_args.start_timeout,
|
|
||||||
|| self.pg_isready(&pg_bin_dir, postgres_port),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
self.setup_database(postgres_port).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
|
||||||
|
|
||||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||||
// This is used by the test suite.
|
// This is used by the test suite.
|
||||||
@@ -407,7 +339,7 @@ impl StorageController {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
|
let (mut client, conn) = self.connect_to_database().await?;
|
||||||
let conn = tokio::spawn(conn);
|
let conn = tokio::spawn(conn);
|
||||||
let tx = client.build_transaction();
|
let tx = client.build_transaction();
|
||||||
let tx = tx.start().await?;
|
let tx = tx.start().await?;
|
||||||
@@ -416,20 +348,9 @@ impl StorageController {
|
|||||||
drop(client);
|
drop(client);
|
||||||
conn.await??;
|
conn.await??;
|
||||||
|
|
||||||
let listen = self
|
|
||||||
.listen
|
|
||||||
.get()
|
|
||||||
.expect("cell is set earlier in this function");
|
|
||||||
let address_for_peers = Uri::builder()
|
|
||||||
.scheme("http")
|
|
||||||
.authority(format!("{}:{}", listen.ip(), listen.port()))
|
|
||||||
.path_and_query("")
|
|
||||||
.build()
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let mut args = vec![
|
let mut args = vec![
|
||||||
"-l",
|
"-l",
|
||||||
&listen.to_string(),
|
&self.listen,
|
||||||
"--dev",
|
"--dev",
|
||||||
"--database-url",
|
"--database-url",
|
||||||
&database_url,
|
&database_url,
|
||||||
@@ -437,27 +358,15 @@ impl StorageController {
|
|||||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||||
"--max-warming-up-interval",
|
"--max-warming-up-interval",
|
||||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||||
"--address-for-peers",
|
|
||||||
&address_for_peers.to_string(),
|
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|s| s.to_string())
|
.map(|s| s.to_string())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
if self.config.start_as_candidate {
|
|
||||||
args.push("--start-as-candidate".to_string());
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(private_key) = &self.private_key {
|
if let Some(private_key) = &self.private_key {
|
||||||
let claims = Claims::new(None, Scope::PageServerApi);
|
let claims = Claims::new(None, Scope::PageServerApi);
|
||||||
let jwt_token =
|
let jwt_token =
|
||||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||||
args.push(format!("--jwt-token={jwt_token}"));
|
args.push(format!("--jwt-token={jwt_token}"));
|
||||||
|
|
||||||
let peer_claims = Claims::new(None, Scope::Admin);
|
|
||||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
|
||||||
.expect("failed to generate jwt token");
|
|
||||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(public_key) = &self.public_key {
|
if let Some(public_key) = &self.public_key {
|
||||||
@@ -485,15 +394,15 @@ impl StorageController {
|
|||||||
|
|
||||||
background_process::start_process(
|
background_process::start_process(
|
||||||
COMMAND,
|
COMMAND,
|
||||||
&instance_dir,
|
&self.env.base_data_dir,
|
||||||
&self.env.storage_controller_bin(),
|
&self.env.storage_controller_bin(),
|
||||||
args,
|
args,
|
||||||
vec![
|
vec![
|
||||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||||
],
|
],
|
||||||
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
|
background_process::InitialPidFile::Create(self.pid_file()),
|
||||||
&start_args.start_timeout,
|
retry_timeout,
|
||||||
|| async {
|
|| async {
|
||||||
match self.ready().await {
|
match self.ready().await {
|
||||||
Ok(_) => Ok(true),
|
Ok(_) => Ok(true),
|
||||||
@@ -506,35 +415,8 @@ impl StorageController {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
|
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||||
background_process::stop_process(
|
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||||
stop_args.immediate,
|
|
||||||
COMMAND,
|
|
||||||
&self.pid_file(stop_args.instance_id),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let storcon_instances = self.env.storage_controller_instances().await?;
|
|
||||||
for (instance_id, instanced_dir_path) in storcon_instances {
|
|
||||||
if instance_id == stop_args.instance_id {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let pid_file = instanced_dir_path.join("storage_controller.pid");
|
|
||||||
let pid = tokio::fs::read_to_string(&pid_file)
|
|
||||||
.await
|
|
||||||
.map_err(|err| {
|
|
||||||
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
|
|
||||||
})?
|
|
||||||
.parse::<i32>()
|
|
||||||
.expect("pid is valid i32");
|
|
||||||
|
|
||||||
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
|
|
||||||
if other_proc_alive {
|
|
||||||
// There is another storage controller instance running, so we return
|
|
||||||
// and leave the database running.
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
@@ -547,51 +429,27 @@ impl StorageController {
|
|||||||
.wait()
|
.wait()
|
||||||
.await?;
|
.await?;
|
||||||
if !stop_status.success() {
|
if !stop_status.success() {
|
||||||
match self.is_postgres_running().await {
|
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||||
Ok(false) => {
|
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||||
println!("Storage controller database is already stopped");
|
.args(pg_status_args)
|
||||||
return Ok(());
|
.spawn()?
|
||||||
}
|
.wait()
|
||||||
Ok(true) => {
|
.await?;
|
||||||
anyhow::bail!("Failed to stop storage controller database");
|
|
||||||
}
|
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||||
Err(err) => {
|
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||||
anyhow::bail!("Failed to stop storage controller database: {err}");
|
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||||
}
|
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||||
|
println!("Storage controller database is already stopped");
|
||||||
|
return Ok(());
|
||||||
|
} else {
|
||||||
|
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
|
|
||||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
|
||||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
|
||||||
|
|
||||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
|
||||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
|
||||||
.args(pg_status_args)
|
|
||||||
.spawn()?
|
|
||||||
.wait()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
|
||||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
|
||||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
|
||||||
const PG_NO_DATA_DIR: i32 = 4;
|
|
||||||
const PG_STATUS_RUNNING: i32 = 0;
|
|
||||||
match status_exitcode.code() {
|
|
||||||
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
|
|
||||||
Some(PG_NO_DATA_DIR) => Ok(false),
|
|
||||||
Some(PG_STATUS_RUNNING) => Ok(true),
|
|
||||||
Some(code) => Err(anyhow::anyhow!(
|
|
||||||
"pg_ctl status returned unexpected status code: {:?}",
|
|
||||||
code
|
|
||||||
)),
|
|
||||||
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||||
let category = match path.find('/') {
|
let category = match path.find('/') {
|
||||||
Some(idx) => &path[..idx],
|
Some(idx) => &path[..idx],
|
||||||
@@ -617,31 +475,15 @@ impl StorageController {
|
|||||||
RQ: Serialize + Sized,
|
RQ: Serialize + Sized,
|
||||||
RS: DeserializeOwned + Sized,
|
RS: DeserializeOwned + Sized,
|
||||||
{
|
{
|
||||||
// In the special case of the `storage_controller start` subcommand, we wish
|
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||||
// to use the API endpoint of the newly started storage controller in order
|
// for general purpose API access.
|
||||||
// to pass the readiness check. In this scenario [`Self::listen`] will be set
|
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||||
// (see [`Self::start`]).
|
let url = Url::from_str(&format!(
|
||||||
//
|
"http://{}:{}/{path}",
|
||||||
// Otherwise, we infer the storage controller api endpoint from the configured
|
listen_url.host_str().unwrap(),
|
||||||
// control plane API.
|
listen_url.port().unwrap()
|
||||||
let url = if let Some(socket_addr) = self.listen.get() {
|
))
|
||||||
Url::from_str(&format!(
|
.unwrap();
|
||||||
"http://{}:{}/{path}",
|
|
||||||
socket_addr.ip().to_canonical(),
|
|
||||||
socket_addr.port()
|
|
||||||
))
|
|
||||||
.unwrap()
|
|
||||||
} else {
|
|
||||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
|
||||||
// for general purpose API access.
|
|
||||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
|
||||||
Url::from_str(&format!(
|
|
||||||
"http://{}:{}/{path}",
|
|
||||||
listen_url.host_str().unwrap(),
|
|
||||||
listen_url.port().unwrap()
|
|
||||||
))
|
|
||||||
.unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut builder = self.client.request(method, url);
|
let mut builder = self.client.request(method, url);
|
||||||
if let Some(body) = body {
|
if let Some(body) = body {
|
||||||
|
|||||||
@@ -41,8 +41,6 @@ enum Command {
|
|||||||
listen_http_addr: String,
|
listen_http_addr: String,
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
listen_http_port: u16,
|
listen_http_port: u16,
|
||||||
#[arg(long)]
|
|
||||||
availability_zone_id: String,
|
|
||||||
},
|
},
|
||||||
|
|
||||||
/// Modify a node's configuration in the storage controller
|
/// Modify a node's configuration in the storage controller
|
||||||
@@ -149,9 +147,9 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
threshold: humantime::Duration,
|
threshold: humantime::Duration,
|
||||||
},
|
},
|
||||||
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
|
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
||||||
// outside of the specified set.
|
// outside of the specified set.
|
||||||
BulkMigrate {
|
Drain {
|
||||||
// Set of pageserver node ids to drain.
|
// Set of pageserver node ids to drain.
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
nodes: Vec<NodeId>,
|
nodes: Vec<NodeId>,
|
||||||
@@ -165,34 +163,6 @@ enum Command {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
dry_run: Option<bool>,
|
dry_run: Option<bool>,
|
||||||
},
|
},
|
||||||
/// Start draining the specified pageserver.
|
|
||||||
/// The drain is complete when the schedulling policy returns to active.
|
|
||||||
StartDrain {
|
|
||||||
#[arg(long)]
|
|
||||||
node_id: NodeId,
|
|
||||||
},
|
|
||||||
/// Cancel draining the specified pageserver and wait for `timeout`
|
|
||||||
/// for the operation to be canceled. May be retried.
|
|
||||||
CancelDrain {
|
|
||||||
#[arg(long)]
|
|
||||||
node_id: NodeId,
|
|
||||||
#[arg(long)]
|
|
||||||
timeout: humantime::Duration,
|
|
||||||
},
|
|
||||||
/// Start filling the specified pageserver.
|
|
||||||
/// The drain is complete when the schedulling policy returns to active.
|
|
||||||
StartFill {
|
|
||||||
#[arg(long)]
|
|
||||||
node_id: NodeId,
|
|
||||||
},
|
|
||||||
/// Cancel filling the specified pageserver and wait for `timeout`
|
|
||||||
/// for the operation to be canceled. May be retried.
|
|
||||||
CancelFill {
|
|
||||||
#[arg(long)]
|
|
||||||
node_id: NodeId,
|
|
||||||
#[arg(long)]
|
|
||||||
timeout: humantime::Duration,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
@@ -279,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_scheduling_policy<F>(
|
|
||||||
client: Client,
|
|
||||||
node_id: NodeId,
|
|
||||||
timeout: Duration,
|
|
||||||
f: F,
|
|
||||||
) -> anyhow::Result<NodeSchedulingPolicy>
|
|
||||||
where
|
|
||||||
F: Fn(NodeSchedulingPolicy) -> bool,
|
|
||||||
{
|
|
||||||
let waiter = tokio::time::timeout(timeout, async move {
|
|
||||||
loop {
|
|
||||||
let node = client
|
|
||||||
.dispatch::<(), NodeDescribeResponse>(
|
|
||||||
Method::GET,
|
|
||||||
format!("control/v1/node/{node_id}"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
if f(node.scheduling) {
|
|
||||||
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
Ok(waiter.await??)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
@@ -324,7 +266,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
listen_pg_port,
|
listen_pg_port,
|
||||||
listen_http_addr,
|
listen_http_addr,
|
||||||
listen_http_port,
|
listen_http_port,
|
||||||
availability_zone_id,
|
|
||||||
} => {
|
} => {
|
||||||
storcon_client
|
storcon_client
|
||||||
.dispatch::<_, ()>(
|
.dispatch::<_, ()>(
|
||||||
@@ -336,7 +277,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
listen_pg_port,
|
listen_pg_port,
|
||||||
listen_http_addr,
|
listen_http_addr,
|
||||||
listen_http_port,
|
listen_http_port,
|
||||||
availability_zone_id: Some(availability_zone_id),
|
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
@@ -682,13 +622,12 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
threshold: threshold.into(),
|
threshold: threshold.into(),
|
||||||
},
|
},
|
||||||
)),
|
)),
|
||||||
heatmap_period: Some("300s".to_string()),
|
|
||||||
..Default::default()
|
..Default::default()
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
Command::BulkMigrate {
|
Command::Drain {
|
||||||
nodes,
|
nodes,
|
||||||
concurrency,
|
concurrency,
|
||||||
max_shards,
|
max_shards,
|
||||||
@@ -717,7 +656,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if nodes.len() != node_to_drain_descs.len() {
|
if nodes.len() != node_to_drain_descs.len() {
|
||||||
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
|
anyhow::bail!("Drain requested for node which doesn't exist.")
|
||||||
}
|
}
|
||||||
|
|
||||||
node_to_fill_descs.retain(|desc| {
|
node_to_fill_descs.retain(|desc| {
|
||||||
@@ -729,7 +668,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if node_to_fill_descs.is_empty() {
|
if node_to_fill_descs.is_empty() {
|
||||||
anyhow::bail!("There are no nodes to migrate to")
|
anyhow::bail!("There are no nodes to drain to")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the node scheduling policy to draining for the nodes which
|
// Set the node scheduling policy to draining for the nodes which
|
||||||
@@ -750,7 +689,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.await?;
|
.await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Perform the migration: move each tenant shard scheduled on a node to
|
// Perform the drain: move each tenant shard scheduled on a node to
|
||||||
// be drained to a node which is being filled. A simple round robin
|
// be drained to a node which is being filled. A simple round robin
|
||||||
// strategy is used to pick the new node.
|
// strategy is used to pick the new node.
|
||||||
let tenants = storcon_client
|
let tenants = storcon_client
|
||||||
@@ -763,13 +702,13 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
|
|
||||||
let mut selected_node_idx = 0;
|
let mut selected_node_idx = 0;
|
||||||
|
|
||||||
struct MigrationMove {
|
struct DrainMove {
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
from: NodeId,
|
from: NodeId,
|
||||||
to: NodeId,
|
to: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut moves: Vec<MigrationMove> = Vec::new();
|
let mut moves: Vec<DrainMove> = Vec::new();
|
||||||
|
|
||||||
let shards = tenants
|
let shards = tenants
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -799,7 +738,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
moves.push(MigrationMove {
|
moves.push(DrainMove {
|
||||||
tenant_shard_id: shard.tenant_shard_id,
|
tenant_shard_id: shard.tenant_shard_id,
|
||||||
from: shard
|
from: shard
|
||||||
.node_attached
|
.node_attached
|
||||||
@@ -876,67 +815,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
failure
|
failure
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Command::StartDrain { node_id } => {
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<(), ()>(
|
|
||||||
Method::PUT,
|
|
||||||
format!("control/v1/node/{node_id}/drain"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
println!("Drain started for {node_id}");
|
|
||||||
}
|
|
||||||
Command::CancelDrain { node_id, timeout } => {
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<(), ()>(
|
|
||||||
Method::DELETE,
|
|
||||||
format!("control/v1/node/{node_id}/drain"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
|
||||||
|
|
||||||
let final_policy =
|
|
||||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
|
||||||
use NodeSchedulingPolicy::*;
|
|
||||||
matches!(sched, Active | PauseForRestart)
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
Command::StartFill { node_id } => {
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!("Fill started for {node_id}");
|
|
||||||
}
|
|
||||||
Command::CancelFill { node_id, timeout } => {
|
|
||||||
storcon_client
|
|
||||||
.dispatch::<(), ()>(
|
|
||||||
Method::DELETE,
|
|
||||||
format!("control/v1/node/{node_id}/fill"),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
|
||||||
|
|
||||||
let final_policy =
|
|
||||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
|
||||||
use NodeSchedulingPolicy::*;
|
|
||||||
matches!(sched, Active)
|
|
||||||
})
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
println!(
|
|
||||||
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ set -x
|
|||||||
|
|
||||||
cd /ext-src || exit 2
|
cd /ext-src || exit 2
|
||||||
FAILED=
|
FAILED=
|
||||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||||
for d in ${LIST}
|
for d in ${LIST}
|
||||||
do
|
do
|
||||||
[ -d "${d}" ] || continue
|
[ -d "${d}" ] || continue
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
|
|||||||
during the restart at 2024-04-03 16:37 UTC.
|
during the restart at 2024-04-03 16:37 UTC.
|
||||||
|
|
||||||
Note that lots of shutdowns on loaded pageservers do not finish within the
|
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||||
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||||
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||||
|
|
||||||
This problem is not yet very acutely felt in storage controller managed pageservers since
|
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||||
|
|||||||
@@ -1,495 +0,0 @@
|
|||||||
# Safekeeper dynamic membership change
|
|
||||||
|
|
||||||
To quickly recover from safekeeper node failures and do rebalancing we need to
|
|
||||||
be able to change set of safekeepers the timeline resides on. The procedure must
|
|
||||||
be safe (not lose committed log) regardless of safekeepers and compute state. It
|
|
||||||
should be able to progress if any majority of old safekeeper set, any majority
|
|
||||||
of new safekeeper set and compute are up and connected. This is known as a
|
|
||||||
consensus membership change. It always involves two phases: 1) switch old
|
|
||||||
majority to old + new configuration, preventing commits without acknowledge from
|
|
||||||
the new set 2) bootstrap the new set by ensuring majority of the new set has all
|
|
||||||
data which ever could have been committed before the first phase completed;
|
|
||||||
after that switch is safe to finish. Without two phases switch to the new set
|
|
||||||
which quorum might not intersect with quorum of the old set (and typical case of
|
|
||||||
ABC -> ABD switch is an example of that, because quorums AC and BD don't
|
|
||||||
intersect). Furthermore, procedure is typically carried out by the consensus
|
|
||||||
leader, and so enumeration of configurations which establishes order between
|
|
||||||
them is done through consensus log.
|
|
||||||
|
|
||||||
In our case consensus leader is compute (walproposer), and we don't want to wake
|
|
||||||
up all computes for the change. Nor do we want to fully reimplement the leader
|
|
||||||
logic second time outside compute. Because of that the proposed algorithm relies
|
|
||||||
for issuing configurations on the external fault tolerant (distributed) strongly
|
|
||||||
consistent storage with simple API: CAS (compare-and-swap) on the single key.
|
|
||||||
Properly configured postgres suits this.
|
|
||||||
|
|
||||||
In the system consensus is implemented at the timeline level, so algorithm below
|
|
||||||
applies to the single timeline.
|
|
||||||
|
|
||||||
## Algorithm
|
|
||||||
|
|
||||||
### Definitions
|
|
||||||
|
|
||||||
A configuration is
|
|
||||||
|
|
||||||
```
|
|
||||||
struct Configuration {
|
|
||||||
generation: Generation, // a number uniquely identifying configuration
|
|
||||||
sk_set: Vec<NodeId>, // current safekeeper set
|
|
||||||
new_sk_set: Optional<Vec<NodeId>>,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Configuration with `new_sk_set` present is used for the intermediate step during
|
|
||||||
the change and called joint configuration. Generations establish order of
|
|
||||||
configurations: we say `c1` is higher than `c2` if `c1.generation` >
|
|
||||||
`c2.generation`.
|
|
||||||
|
|
||||||
### Persistently stored data changes
|
|
||||||
|
|
||||||
Safekeeper starts storing its current configuration in the control file. Update
|
|
||||||
of it is atomic, so in-memory value always matches the persistent one.
|
|
||||||
|
|
||||||
External CAS providing storage (let's call it configuration storage here) also
|
|
||||||
stores configuration for each timeline. It is initialized with generation 1 and
|
|
||||||
initial set of safekeepers during timeline creation. Executed CAS on it must
|
|
||||||
never be lost.
|
|
||||||
|
|
||||||
### Compute <-> safekeeper protocol changes
|
|
||||||
|
|
||||||
`ProposerGreeting` message carries walproposer's configuration if it is already
|
|
||||||
established (see below), else null. `AcceptorGreeting` message carries
|
|
||||||
safekeeper's current `Configuration`. All further messages (`VoteRequest`,
|
|
||||||
`VoteResponse`, `ProposerElected`, `AppendRequest`, `AppendResponse`) carry
|
|
||||||
generation number, of walproposer in case of wp->sk message or of safekeeper in
|
|
||||||
case of sk->wp message.
|
|
||||||
|
|
||||||
### Safekeeper changes
|
|
||||||
|
|
||||||
Basic rule: once safekeeper observes configuration higher than his own it
|
|
||||||
immediately switches to it. It must refuse all messages with lower generation
|
|
||||||
than its own. It also refuses messages if it is not a member of the current generation
|
|
||||||
(that is, of either `sk_set` or `new_sk_set`), though it is likely not unsafe to
|
|
||||||
process them (walproposer should ignore result anyway).
|
|
||||||
|
|
||||||
If there is non null configuration in `ProposerGreeting` and it is higher than
|
|
||||||
current safekeeper one, safekeeper switches to it.
|
|
||||||
|
|
||||||
Safekeeper sends its current configuration in its first message to walproposer
|
|
||||||
`AcceptorGreeting`. It refuses all other walproposer messages if the
|
|
||||||
configuration generation in them is less than its current one. Namely, it
|
|
||||||
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
|
|
||||||
response it sends its current configuration generation to let walproposer know.
|
|
||||||
|
|
||||||
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
|
|
||||||
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
|
|
||||||
current one and ignores it otherwise. In any case it replies with
|
|
||||||
```
|
|
||||||
struct ConfigurationSwitchResponse {
|
|
||||||
conf: Configuration,
|
|
||||||
term: Term,
|
|
||||||
last_log_term: Term,
|
|
||||||
flush_lsn: Lsn,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Compute (walproposer) changes
|
|
||||||
|
|
||||||
Basic rule is that joint configuration requires votes from majorities in the
|
|
||||||
both `sk_set` and `new_sk_set`.
|
|
||||||
|
|
||||||
Compute receives list of safekeepers to connect to from the control plane as
|
|
||||||
currently and tries to communicate with all of them. However, the list does not
|
|
||||||
define consensus members. Instead, on start walproposer tracks highest
|
|
||||||
configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
|
|
||||||
from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
|
|
||||||
establishes this configuration as its own and moves to voting.
|
|
||||||
|
|
||||||
It should stop talking to safekeepers not listed in the configuration at this
|
|
||||||
point, though it is not unsafe to continue doing so.
|
|
||||||
|
|
||||||
To be elected it must receive votes from both majorities if `new_sk_set` is present.
|
|
||||||
Similarly, to commit WAL it must receive flush acknowledge from both majorities.
|
|
||||||
|
|
||||||
If walproposer hears from safekeeper configuration higher than his own (i.e.
|
|
||||||
refusal to accept due to configuration change) it simply restarts.
|
|
||||||
|
|
||||||
### Change algorithm
|
|
||||||
|
|
||||||
The following algorithm can be executed anywhere having access to configuration
|
|
||||||
storage and safekeepers. It is safe to interrupt / restart it and run multiple
|
|
||||||
instances of it concurrently, though likely one of them won't make
|
|
||||||
progress then. It accepts `desired_set: Vec<NodeId>` as input.
|
|
||||||
|
|
||||||
Algorithm will refuse to make the change if it encounters previous interrupted
|
|
||||||
change attempt, but in this case it will try to finish it.
|
|
||||||
|
|
||||||
It will eventually converge if old majority, new majority and configuration
|
|
||||||
storage are reachable.
|
|
||||||
|
|
||||||
1) Fetch current timeline configuration from the configuration storage.
|
|
||||||
2) If it is already joint one and `new_sk_set` is different from `desired_set`
|
|
||||||
refuse to change. However, assign joint conf to (in memory) var
|
|
||||||
`joint_conf` and proceed to step 4 to finish the ongoing change.
|
|
||||||
3) Else, create joint `joint_conf: Configuration`: increment current conf number
|
|
||||||
`n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
|
|
||||||
storage by doing CAS on the current generation: change happens only if
|
|
||||||
current configuration number is still `n`. Apart from guaranteeing uniqueness
|
|
||||||
of configurations, CAS linearizes them, ensuring that new configuration is
|
|
||||||
created only following the previous one when we know that the transition is
|
|
||||||
safe. Failed CAS aborts the procedure.
|
|
||||||
4) Call `PUT` `configuration` on safekeepers from the current set,
|
|
||||||
delivering them `joint_conf`. Collecting responses from majority is required
|
|
||||||
to proceed. If any response returned generation higher than
|
|
||||||
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
|
|
||||||
max `<last_log_term, flush_lsn>` among responses and establish it as
|
|
||||||
(in memory) `sync_position`. Also choose max `term` and establish it as (in
|
|
||||||
memory) `sync_term`. We can't finish the switch until majority of the new set
|
|
||||||
catches up to this `sync_position` because data before it could be committed
|
|
||||||
without ack from the new set. Similarly, we'll bump term on new majority
|
|
||||||
to `sync_term` so that two computes with the same term are never elected.
|
|
||||||
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
|
|
||||||
doesn't exist yet by doing `pull_timeline` from the majority of the
|
|
||||||
current set. Doing that on majority of `new_sk_set` is enough to
|
|
||||||
proceed, but it is reasonable to ensure that all `new_sk_set` members
|
|
||||||
are initialized -- if some of them are down why are we migrating there?
|
|
||||||
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
|
|
||||||
Success on majority is enough.
|
|
||||||
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
|
|
||||||
delivering them `joint_conf` and collecting their positions. This will
|
|
||||||
switch them to the `joint_conf` which generally won't be needed
|
|
||||||
because `pull_timeline` already includes it and plus additionally would be
|
|
||||||
broadcast by compute. More importantly, we may proceed to the next step
|
|
||||||
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
|
|
||||||
`sync_position`. Similarly, on the happy path no waiting is needed because
|
|
||||||
`pull_timeline` already includes it. However, we should double
|
|
||||||
check to be safe. For example, timeline could have been created earlier e.g.
|
|
||||||
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
|
|
||||||
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
|
|
||||||
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
|
|
||||||
storage under one more CAS.
|
|
||||||
8) Call `PUT` `configuration` on safekeepers from the new set,
|
|
||||||
delivering them `new_conf`. It is enough to deliver it to the majority
|
|
||||||
of the new set; the rest can be updated by compute.
|
|
||||||
|
|
||||||
I haven't put huge effort to make the description above very precise, because it
|
|
||||||
is natural language prone to interpretations anyway. Instead I'd like to make TLA+
|
|
||||||
spec of it.
|
|
||||||
|
|
||||||
Description above focuses on safety. To make the flow practical and live, here are a few more
|
|
||||||
considerations.
|
|
||||||
1) It makes sense to ping new set to ensure we are migrating to live node(s) before
|
|
||||||
step 3.
|
|
||||||
2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
|
|
||||||
it is safe to rollback to the old conf with one more CAS.
|
|
||||||
3) On step 4 timeline might be already created on members of the new set for various reasons;
|
|
||||||
the simplest is the procedure restart. There are more complicated scenarios like mentioned
|
|
||||||
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
|
|
||||||
generations, so seems simpler to treat existing timeline as success. However, this also
|
|
||||||
has a disadvantage: you might imagine a surpassingly unlikely schedule where condition in
|
|
||||||
the step 5 is never reached until compute is (re)awakened to synchronize new member(s).
|
|
||||||
I don't think we'll observe this in practice, but can add waking up compute if needed.
|
|
||||||
4) In the end timeline should be locally deleted on the safekeeper(s) which are
|
|
||||||
in the old set but not in the new one, unless they are unreachable. To be
|
|
||||||
safe this also should be done under generation number (deletion proceeds only if
|
|
||||||
current configuration is <= the one in request and safekeeper is not a member of it).
|
|
||||||
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
|
|
||||||
jump to step 7, using it as `new_conf`.
|
|
||||||
|
|
||||||
## Implementation
|
|
||||||
|
|
||||||
The procedure ought to be driven from somewhere. Obvious candidates are control
|
|
||||||
plane and storage_controller; and as each of them already has db we don't want
|
|
||||||
yet another storage. I propose to manage safekeepers in storage_controller
|
|
||||||
because 1) since it is in rust it simplifies simulation testing (more on this
|
|
||||||
below) 2) it already manages pageservers.
|
|
||||||
|
|
||||||
This assumes that migration will be fully usable only after we migrate all
|
|
||||||
tenants/timelines to storage_controller. It is discussible whether we want also
|
|
||||||
to manage pageserver attachments for all of these, but likely we do.
|
|
||||||
|
|
||||||
This requires us to define storcon <-> cplane interface.
|
|
||||||
|
|
||||||
### storage_controller <-> control plane interface
|
|
||||||
|
|
||||||
First of all, control plane should
|
|
||||||
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
|
|
||||||
storing safekeepers per timeline instead of per tenant because we can't migrate
|
|
||||||
tenants atomically.
|
|
||||||
|
|
||||||
The important question is how updated configuration is delivered from
|
|
||||||
storage_controller to control plane to provide it to computes. As always, there
|
|
||||||
are two options, pull and push. Let's do it the same push as with pageserver
|
|
||||||
`/notify-attach` because 1) it keeps storage_controller out of critical compute
|
|
||||||
start path 2) provides easier upgrade: there won't be such a thing as 'timeline
|
|
||||||
managed by control plane / storcon', cplane just takes the value out of its db
|
|
||||||
when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
|
|
||||||
control plane until it succeeds.
|
|
||||||
|
|
||||||
So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
|
|
||||||
updates it in the db if the provided conf generation is higher (the cplane db
|
|
||||||
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
|
|
||||||
should update db which makes the call successful, and then try to schedule
|
|
||||||
`apply_config` if possible, it is ok if not. storage_controller
|
|
||||||
should rate limit calling the endpoint, but likely this won't be needed, as migration
|
|
||||||
throughput is limited by `pull_timeline`.
|
|
||||||
|
|
||||||
Timeline (branch) creation in cplane should call storage_controller POST
|
|
||||||
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
|
|
||||||
Response should be augmented with `safekeeper_conf: Configuration`. The call
|
|
||||||
should be retried until it succeeds.
|
|
||||||
|
|
||||||
Timeline deletion and tenant deletion in cplane should call appropriate
|
|
||||||
storage_controller endpoints like it currently does for sharded tenants. The
|
|
||||||
calls should be retried until they succeed.
|
|
||||||
|
|
||||||
### storage_controller implementation
|
|
||||||
|
|
||||||
Current 'load everything on startup and keep in memory' easy design is fine.
|
|
||||||
Single timeline shouldn't take more than 100 bytes (it's 16 byte tenant_id, 16
|
|
||||||
byte timeline_id, int generation, vec of ~3 safekeeper ids plus some flags), so
|
|
||||||
10^6 of timelines shouldn't take more than 100MB.
|
|
||||||
|
|
||||||
Similar to pageserver attachment Intents storage_controller would have in-memory
|
|
||||||
`MigrationRequest` (or its absence) for each timeline and pool of tasks trying
|
|
||||||
to make these requests reality; this ensures one instance of storage_controller
|
|
||||||
won't do several migrations on the same timeline concurrently. In the first
|
|
||||||
version it is simpler to have more manual control and no retries, i.e. migration
|
|
||||||
failure removes the request. Later we can build retries and automatic
|
|
||||||
scheduling/migration. `MigrationRequest` is
|
|
||||||
```
|
|
||||||
enum MigrationRequest {
|
|
||||||
To(Vec<NodeId>),
|
|
||||||
FinishPending,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
`FinishPending` requests to run the procedure to ensure state is clean: current
|
|
||||||
configuration is not joint and majority of safekeepers are aware of it, but do
|
|
||||||
not attempt to migrate anywhere. If current configuration fetched on step 1 is
|
|
||||||
not joint it jumps to step 7. It should be run at startup for all timelines (but
|
|
||||||
similarly, in the first version it is ok to trigger it manually).
|
|
||||||
|
|
||||||
#### Schema
|
|
||||||
|
|
||||||
`safekeepers` table mirroring current `nodes` should be added, except that for
|
|
||||||
`scheduling_policy` field (seems like `status` is a better name for it): it is enough
|
|
||||||
to have at least in the beginning only 3 fields: 1) `active` 2) `offline` 3)
|
|
||||||
`decomissioned`.
|
|
||||||
|
|
||||||
`timelines` table:
|
|
||||||
```
|
|
||||||
table! {
|
|
||||||
// (tenant_id, timeline_id) is primary key
|
|
||||||
timelines (tenant_id, timeline_id) {
|
|
||||||
timeline_id -> Varchar,
|
|
||||||
tenant_id -> Varchar,
|
|
||||||
generation -> Int4,
|
|
||||||
sk_set -> Array<Int4>, // list of safekeeper ids
|
|
||||||
new_sk_set -> Nullable<Array<Int4>>, // list of safekeeper ids, null if not joint conf
|
|
||||||
cplane_notified_generation -> Int4,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### API
|
|
||||||
|
|
||||||
Node management is similar to pageserver:
|
|
||||||
1) POST `/control/v1/safekeepers` upserts safekeeper.
|
|
||||||
2) GET `/control/v1/safekeepers` lists safekeepers.
|
|
||||||
3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
|
|
||||||
4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g.
|
|
||||||
`offline` or `decomissioned`. Initially it is simpler not to schedule any
|
|
||||||
migrations here.
|
|
||||||
|
|
||||||
Safekeeper deploy scripts should register safekeeper at storage_controller as
|
|
||||||
they currently do with cplane, under the same id.
|
|
||||||
|
|
||||||
Timeline creation/deletion: already existing POST `tenant/:tenant_id/timeline`
|
|
||||||
would 1) choose initial set of safekeepers; 2) write to the db initial
|
|
||||||
`Configuration` with `INSERT ON CONFLICT DO NOTHING` returning existing row in
|
|
||||||
case of conflict; 3) create timeline on the majority of safekeepers (already
|
|
||||||
created is ok).
|
|
||||||
|
|
||||||
We don't want to block timeline creation when one safekeeper is down. Currently
|
|
||||||
this is solved by compute implicitly creating timeline on any safekeeper it is
|
|
||||||
connected to. This creates ugly timeline state on safekeeper when timeline is
|
|
||||||
created, but start LSN is not defined yet. It would be nice to remove this; to
|
|
||||||
do that, controller can in the background retry to create timeline on
|
|
||||||
safekeeper(s) which missed that during initial creation call. It can do that
|
|
||||||
through `pull_timeline` from majority so it doesn't need to remember
|
|
||||||
`parent_lsn` in its db.
|
|
||||||
|
|
||||||
Timeline deletion removes the row from the db and forwards deletion to the
|
|
||||||
current configuration members. Without additional actions deletions might leak,
|
|
||||||
see below on this; initially let's ignore these, reporting to cplane success if
|
|
||||||
at least one safekeeper deleted the timeline (this will remove s3 data).
|
|
||||||
|
|
||||||
Tenant deletion repeats timeline deletion for all timelines.
|
|
||||||
|
|
||||||
Migration API: the first version is the simplest and the most imperative:
|
|
||||||
1) PUT `/control/v1/safekeepers/migrate` schedules `MigrationRequest`s to move
|
|
||||||
all timelines from one safekeeper to another. It accepts json
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"src_sk": u32,
|
|
||||||
"dst_sk": u32,
|
|
||||||
"limit": Optional<u32>,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Returns list of scheduled requests.
|
|
||||||
|
|
||||||
2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
|
|
||||||
to move single timeline to given set of safekeepers:
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"desired_set": Vec<u32>,
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Returns scheduled request.
|
|
||||||
|
|
||||||
Similar call should be added for the tenant.
|
|
||||||
|
|
||||||
It would be great to have some way of subscribing to the results (apart from
|
|
||||||
looking at logs/metrics).
|
|
||||||
|
|
||||||
Migration is executed as described above. One subtlety is that (local) deletion on
|
|
||||||
source safekeeper might fail, which is not a problem if we are going to
|
|
||||||
decommission the node but leaves garbage otherwise. I'd propose in the first version
|
|
||||||
1) Don't attempt deletion at all if node status is `offline`.
|
|
||||||
2) If it failed, just issue warning.
|
|
||||||
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
|
|
||||||
remove garbage timelines for manual use. It will 1) list all timelines on the
|
|
||||||
safekeeper 2) compare each one against configuration storage: if timeline
|
|
||||||
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
|
|
||||||
be deleted under generation number if node is not member of current generation.
|
|
||||||
|
|
||||||
Automating this is non-trivial; we'd need to register all potential missing
|
|
||||||
deletions <tenant_id, timeline_id, generation, node_id> in the same transaction
|
|
||||||
which switches configurations. Similarly when timeline is fully deleted to
|
|
||||||
prevent cplane operation from blocking when some safekeeper is not available
|
|
||||||
deletion should be also registered.
|
|
||||||
|
|
||||||
One more task pool should infinitely retry notifying control plane about changed
|
|
||||||
safekeeper sets.
|
|
||||||
|
|
||||||
3) GET `/control/v1/tenant/:tenant_id/timeline/:timeline_id/` should return
|
|
||||||
current in memory state of the timeline and pending `MigrationRequest`,
|
|
||||||
if any.
|
|
||||||
|
|
||||||
4) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort` tries to abort the
|
|
||||||
migration by switching configuration from the joint to the one with (previous) `sk_set` under CAS
|
|
||||||
(incrementing generation as always).
|
|
||||||
|
|
||||||
#### Dealing with multiple instances of storage_controller
|
|
||||||
|
|
||||||
Operations described above executed concurrently might create some errors but do
|
|
||||||
not prevent progress, so while we normally don't want to run multiple instances
|
|
||||||
of storage_controller it is fine to have it temporarily, e.g. during redeploy.
|
|
||||||
|
|
||||||
Any interactions with db update in-memory controller state, e.g. if migration
|
|
||||||
request failed because different one is in progress, controller remembers that
|
|
||||||
and tries to finish it.
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
`neon_local` should be switched to use storage_controller, playing role of
|
|
||||||
control plane.
|
|
||||||
|
|
||||||
There should be following layers of tests:
|
|
||||||
1) Model checked TLA+ spec specifies the algorithm and verifies its basic safety.
|
|
||||||
|
|
||||||
2) To cover real code and at the same time test many schedules we should have
|
|
||||||
simulation tests. For that, configuration storage, storage_controller <->
|
|
||||||
safekeeper communication and pull_timeline need to be mocked and main switch
|
|
||||||
procedure wrapped as a node (thread) in simulation tests, using these
|
|
||||||
mocks. Test would inject migrations like it currently injects
|
|
||||||
safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
|
|
||||||
not be lost.
|
|
||||||
|
|
||||||
3) Since simulation testing injects at relatively high level points (not
|
|
||||||
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
|
|
||||||
better to have basic tests covering whole system as well. Extended version of
|
|
||||||
`test_restarts_under_load` would do: start background load and do migration
|
|
||||||
under it, then restart endpoint and check that no reported commits
|
|
||||||
had been lost. I'd also add one more creating classic network split scenario, with
|
|
||||||
one compute talking to AC and another to BD while migration from nodes ABC to ABD
|
|
||||||
happens.
|
|
||||||
|
|
||||||
4) Simple e2e test should ensure that full flow including cplane notification works.
|
|
||||||
|
|
||||||
## Order of implementation and rollout
|
|
||||||
|
|
||||||
Note that
|
|
||||||
- Control plane parts and integration with it is fully independent from everything else
|
|
||||||
(tests would use simulation and neon_local).
|
|
||||||
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
|
|
||||||
and its impl/rollout should be separate from migration itself.
|
|
||||||
- Initially walproposer can just stop working while it observes joint configuration.
|
|
||||||
Such window would be typically very short anyway.
|
|
||||||
|
|
||||||
To rollout smoothly, both walproposer and safekeeper should have flag
|
|
||||||
`configurations_enabled`; when set to false, they would work as currently, i.e.
|
|
||||||
walproposer is able to commit on whatever safekeeper set it is provided. Until
|
|
||||||
all timelines are managed by storcon we'd need to use current script to migrate
|
|
||||||
and update/drop entries in the storage_controller database if it has any.
|
|
||||||
|
|
||||||
Safekeepers would need to be able to talk both current and new protocol version
|
|
||||||
with compute to reduce number of computes restarted in prod once v2 protocol is
|
|
||||||
deployed (though before completely switching we'd need to force this).
|
|
||||||
|
|
||||||
Let's have the following rollout order:
|
|
||||||
- storage_controller becomes aware of safekeepers;
|
|
||||||
- storage_controller gets timeline creation for new timelines and deletion requests, but
|
|
||||||
doesn't manage all timelines yet. Migration can be tested on these new timelines.
|
|
||||||
To keep control plane and storage_controller databases in sync while control
|
|
||||||
plane still chooses the safekeepers initially (until all timelines are imported
|
|
||||||
it can choose better), `TimelineCreateRequest` can get optional safekeepers
|
|
||||||
field with safekeepers chosen by cplane.
|
|
||||||
- Then we can import all existing timelines from control plane to
|
|
||||||
storage_controller and gradually enable configurations region by region.
|
|
||||||
|
|
||||||
|
|
||||||
Very rough implementation order:
|
|
||||||
- Add concept of configurations to safekeepers (including control file),
|
|
||||||
implement v3 protocol.
|
|
||||||
- Implement walproposer changes, including protocol.
|
|
||||||
- Implement storcon part. Use it in neon_local (and pytest).
|
|
||||||
- Make cplane store safekeepers per timeline instead of per tenant.
|
|
||||||
- Implement cplane/storcon integration. Route branch creation/deletion
|
|
||||||
through storcon. Then we can test migration of new branches.
|
|
||||||
- Finally import existing branches. Then we can drop cplane
|
|
||||||
safekeeper selection code. Gradually enable configurations at
|
|
||||||
computes and safekeepers. Before that, all computes must talk only
|
|
||||||
v3 protocol version.
|
|
||||||
|
|
||||||
## Integration with evicted timelines
|
|
||||||
|
|
||||||
Currently, `pull_timeline` doesn't work correctly with evicted timelines because
|
|
||||||
copy would point to original partial file. To fix let's just do s3 copy of the
|
|
||||||
file. It is a bit stupid as generally unnecessary work, but it makes sense to
|
|
||||||
implement proper migration before doing smarter timeline archival. [Issue](https://github.com/neondatabase/neon/issues/8542)
|
|
||||||
|
|
||||||
## Possible optimizations
|
|
||||||
|
|
||||||
Steps above suggest walproposer restart (with re-election) and thus reconnection
|
|
||||||
to safekeepers. Since by bumping term on new majority we ensure that leader
|
|
||||||
terms are unique even across generation switches it is possible to preserve
|
|
||||||
connections. However, it is more complicated, reconnection is very fast and it
|
|
||||||
is much more important to avoid compute restart than millisecond order of write
|
|
||||||
stall.
|
|
||||||
|
|
||||||
Multiple joint consensus: algorithm above rejects attempt to change membership
|
|
||||||
while another attempt is in progress. It is possible to overlay them and AFAIK
|
|
||||||
Aurora does this but similarly I don't think this is needed.
|
|
||||||
|
|
||||||
## Misc
|
|
||||||
|
|
||||||
We should use Compute <-> safekeeper protocol change to include other (long
|
|
||||||
yearned) modifications:
|
|
||||||
- send data in network order to make arm work.
|
|
||||||
- remove term_start_lsn from AppendRequest
|
|
||||||
- add horizon to TermHistory
|
|
||||||
- add to ProposerGreeting number of connection from this wp to sk
|
|
||||||
@@ -1,265 +0,0 @@
|
|||||||
# Physical Replication
|
|
||||||
|
|
||||||
This RFC is a bit special in that we have already implemented physical
|
|
||||||
replication a long time ago. However, we never properly wrote down all
|
|
||||||
the decisions and assumptions, and in the last months when more users
|
|
||||||
have started to use the feature, numerous issues have surfaced.
|
|
||||||
|
|
||||||
This RFC documents the design decisions that have been made.
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
PostgreSQL has a feature called streaming replication, where a replica
|
|
||||||
streams WAL from the primary and continuously applies it. It is also
|
|
||||||
known as "physical replication", to distinguish it from logical
|
|
||||||
replication. In PostgreSQL, a replica is initialized by taking a
|
|
||||||
physical backup of the primary. In Neon, the replica is initialized
|
|
||||||
from a slim "base backup" from the pageserver, just like a primary,
|
|
||||||
and the primary and the replicas connect to the same pageserver,
|
|
||||||
sharing the storage.
|
|
||||||
|
|
||||||
There are two kinds of read-only replicas in Neon:
|
|
||||||
- replicas that follow the primary, and
|
|
||||||
- "static" replicas that are pinned at a particular LSN.
|
|
||||||
|
|
||||||
A static replica is useful e.g. for performing time-travel queries and
|
|
||||||
running one-off slow queries without affecting the primary. A replica
|
|
||||||
that follows the primary can be used e.g. to scale out read-only
|
|
||||||
workloads.
|
|
||||||
|
|
||||||
## Motivation
|
|
||||||
|
|
||||||
Read-only replicas allow offloading read-only queries. It's useful for
|
|
||||||
isolation, if you want to make sure that read-only queries don't
|
|
||||||
affect the primary, and it's also an easy way to provide guaranteed
|
|
||||||
read-only access to an application, without having to mess with access
|
|
||||||
controls.
|
|
||||||
|
|
||||||
## Non Goals (if relevant)
|
|
||||||
|
|
||||||
This RFC is all about WAL-based *physical* replication. Logical
|
|
||||||
replication is a different feature.
|
|
||||||
|
|
||||||
Neon also has the capability to launch "static" read-only nodes which
|
|
||||||
do not follow the primary, but are pinned to a particular LSN. They
|
|
||||||
can be used for long-running one-off queries, or for Point-in-time
|
|
||||||
queries. They work similarly to read replicas that follow the primary,
|
|
||||||
but some things are simpler: there are no concerns about cache
|
|
||||||
invalidation when the data changes on the primary, or worrying about
|
|
||||||
transactions that are in-progress on the primary.
|
|
||||||
|
|
||||||
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
|
||||||
|
|
||||||
- Control plane launches the replica
|
|
||||||
- Replica Postgres instance connects to the safekeepers, to stream the WAL
|
|
||||||
- The primary does not know about the standby, except for the hot standby feedback
|
|
||||||
- The primary and replicas all connect to the same pageservers
|
|
||||||
|
|
||||||
|
|
||||||
# Context
|
|
||||||
|
|
||||||
Some useful things to know about hot standby and replicas in
|
|
||||||
PostgreSQL.
|
|
||||||
|
|
||||||
## PostgreSQL startup sequence
|
|
||||||
|
|
||||||
"Running" and "start up" terms are a little imprecise. PostgreSQL
|
|
||||||
replica startup goes through several stages:
|
|
||||||
|
|
||||||
1. First, the process is started up, and various initialization steps
|
|
||||||
are performed, like initializing shared memory. If you try to
|
|
||||||
connect to the server in this stage, you get an error: ERROR: the
|
|
||||||
database system is starting up. This stage happens very quickly, no
|
|
||||||
|
|
||||||
2. Then the server reads the checkpoint record from the WAL and starts
|
|
||||||
the WAL replay starting from the checkpoint. This works differently
|
|
||||||
in Neon: we start the WAL replay at the basebackup LSN, not from a
|
|
||||||
checkpoint! If you connect to the server in this state, you get an
|
|
||||||
error: ERROR: the database system is not yet accepting
|
|
||||||
connections. We proceed to the next stage, when the WAL replay sees
|
|
||||||
a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
|
|
||||||
can allow us to move directly to next stage, with all the caveats
|
|
||||||
listed in this RFC.
|
|
||||||
|
|
||||||
3. When the running-xacts information is established, the server
|
|
||||||
starts to accept connections normally.
|
|
||||||
|
|
||||||
From PostgreSQL's point of view, the server is already running in
|
|
||||||
stage 2, even though it's not accepting connections yet. Our
|
|
||||||
`compute_ctl` does not consider it as running until stage 3. If the
|
|
||||||
transition from stage 2 to 3 doesn't happen fast enough, the control
|
|
||||||
plane will mark the start operation as failed.
|
|
||||||
|
|
||||||
|
|
||||||
## Decisions, Issues
|
|
||||||
|
|
||||||
### Cache invalidation in replica
|
|
||||||
|
|
||||||
When a read replica follows the primary in PostgreSQL, it needs to
|
|
||||||
stream all the WAL from the primary and apply all the records, to keep
|
|
||||||
the local copy of the data consistent with the primary. In Neon, the
|
|
||||||
replica can fetch the updated page versions from the pageserver, so
|
|
||||||
it's not necessary to apply all the WAL. However, it needs to ensure
|
|
||||||
that any pages that are currently in the Postgres buffer cache, or the
|
|
||||||
Local File Cache, are either updated, or thrown away so that the next
|
|
||||||
read of the page will fetch the latest version.
|
|
||||||
|
|
||||||
We choose to apply the WAL records for pages that are already in the
|
|
||||||
buffer cache, and skip records for other pages. Somewhat arbitrarily,
|
|
||||||
we also apply records affecting catalog relations, fetching the old
|
|
||||||
page version from the pageserver if necessary first. See
|
|
||||||
`neon_redo_read_buffer_filter()` function.
|
|
||||||
|
|
||||||
The replica wouldn't necessarily need to see all the WAL records, only
|
|
||||||
the records that apply to cached pages. For simplicity, we do stream
|
|
||||||
all the WAL to the replica, and the replica simply ignores WAL records
|
|
||||||
that require no action.
|
|
||||||
|
|
||||||
Like in PostgreSQL, the read replica maintains a "replay LSN", which
|
|
||||||
is the LSN up to which the replica has received and replayed the
|
|
||||||
WAL. The replica can lag behind the primary, if it cannot quite keep
|
|
||||||
up with the primary, or if a long-running query conflicts with changes
|
|
||||||
that are about to be applied, or even intentionally if the user wishes
|
|
||||||
to see delayed data (see recovery_min_apply_delay). It's important
|
|
||||||
that the replica sees a consistent view of the whole cluster at the
|
|
||||||
replay LSN, when it's lagging behind.
|
|
||||||
|
|
||||||
In Neon, the replica connects to a safekeeper to get the WAL
|
|
||||||
stream. That means that the safekeepers must be able to regurgitate
|
|
||||||
the original WAL as far back as the replay LSN of any running read
|
|
||||||
replica. (A static read-only node that does not follow the primary
|
|
||||||
does not require a WAL stream however). The primary does not need to
|
|
||||||
be running, and when it is, the replicas don't incur any extra
|
|
||||||
overhead to the primary (see hot standby feedback though).
|
|
||||||
|
|
||||||
### In-progress transactions
|
|
||||||
|
|
||||||
In PostgreSQL, when a hot standby server starts up, it cannot
|
|
||||||
immediately open up for queries (see [PostgreSQL startup
|
|
||||||
sequence]). It first needs to establish a complete list of in-progress
|
|
||||||
transactions, including subtransactions, that are running at the
|
|
||||||
primary, at the current replay LSN. Normally that happens quickly,
|
|
||||||
when the replica sees a "running-xacts" WAL record, because the
|
|
||||||
primary writes a running-xacts WAL record at every checkpoint, and in
|
|
||||||
PostgreSQL the replica always starts the WAL replay from a checkpoint
|
|
||||||
REDO point. (A shutdown checkpoint WAL record also implies that all
|
|
||||||
the non-prepared transactions have ended.) If there are a lot of
|
|
||||||
subtransactions in progress, however, the standby might need to wait
|
|
||||||
for old transactions to complete before it can open up for queries.
|
|
||||||
|
|
||||||
In Neon that problem is worse: a replica can start at any LSN, so
|
|
||||||
there's no guarantee that it will see a running-xacts record any time
|
|
||||||
soon. In particular, if the primary is not running when the replica is
|
|
||||||
started, it might never see a running-xacts record.
|
|
||||||
|
|
||||||
To make things worse, we initially missed this issue, and always
|
|
||||||
started accepting queries at replica startup, even if it didn't have
|
|
||||||
the transaction information. That could lead to incorrect query
|
|
||||||
results and data corruption later. However, as we fixed that, we
|
|
||||||
introduced a new problem compared to what we had before: previously
|
|
||||||
the replica would always start up, but after fixing that bug, it might
|
|
||||||
not. In a superficial way, the old behavior was better (but could lead
|
|
||||||
to serious issues later!). That made fixing that bug very hard,
|
|
||||||
because as we fixed it, we made things (superficially) worse for
|
|
||||||
others.
|
|
||||||
|
|
||||||
See https://github.com/neondatabase/neon/pull/7288 which fixed the
|
|
||||||
bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
|
|
||||||
and https://github.com/neondatabase/neon/pull/8484 to try to claw back
|
|
||||||
the cases that started to cause trouble after fixing it. As of this
|
|
||||||
writing, there are still cases where a replica might not immediately
|
|
||||||
start up, causing the control plane operation to fail. The remaining
|
|
||||||
issues are tracked in https://github.com/neondatabase/neon/issues/6211.
|
|
||||||
|
|
||||||
One long-term fix for this is to switch to using so-called CSN
|
|
||||||
snapshots in read replica. That would make it unnecessary to have the
|
|
||||||
full in-progress transaction list in the replica at startup time. See
|
|
||||||
https://commitfest.postgresql.org/48/4912/ for a work-in-progress
|
|
||||||
patch to upstream to implement that.
|
|
||||||
|
|
||||||
Another thing we could do is to teach the control plane about that
|
|
||||||
distinction between "starting up" and "running but haven't received
|
|
||||||
running-xacts information yet", so that we could keep the replica
|
|
||||||
waiting longer in that stage, and also give any client connections the
|
|
||||||
same `ERROR: the database system is not yet accepting connections`
|
|
||||||
error that you get in standalone PostgreSQL in that state.
|
|
||||||
|
|
||||||
|
|
||||||
### Recovery conflicts and Hot standby feedback
|
|
||||||
|
|
||||||
It's possible that a tuple version is vacuumed away in the primary,
|
|
||||||
even though it is still needed by a running transaction in the
|
|
||||||
replica. This is called a "recovery conflict", and PostgreSQL provides
|
|
||||||
various options for dealing with it. By default, the WAL replay will
|
|
||||||
wait up to 30 s for the conflicting query to finish. After that, it
|
|
||||||
will kill the running query, so that the WAL replay can proceed.
|
|
||||||
|
|
||||||
Another way to avoid the situation is to enable the
|
|
||||||
[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
|
|
||||||
option. When it is enabled, the primary will refrain from vacuuming
|
|
||||||
tuples that are still needed in the replica. That means potentially
|
|
||||||
bloating the primary, which violates the usual rule that read replicas
|
|
||||||
don't affect the operations on the primary, which is why it's off by
|
|
||||||
default. We leave it to users to decide if they want to turn it on,
|
|
||||||
same as PostgreSQL.
|
|
||||||
|
|
||||||
Neon supports `hot_standby_feedback` by passing the feedback messages
|
|
||||||
from the replica to the safekeepers, and from safekeepers to the
|
|
||||||
primary.
|
|
||||||
|
|
||||||
### Relationship of settings between primary and replica
|
|
||||||
|
|
||||||
In order to enter hot standby mode, some configuration options need to
|
|
||||||
be set to the same or larger values in the standby, compared to the
|
|
||||||
primary. See [explanation in the PostgreSQL
|
|
||||||
docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
|
|
||||||
|
|
||||||
In Neon, we have this problem too. To prevent customers from hitting
|
|
||||||
it, the control plane automatically adjusts the settings of a replica,
|
|
||||||
so that they match or exceed the primary's settings (see
|
|
||||||
https://github.com/neondatabase/cloud/issues/14903). However, you
|
|
||||||
can still hit the issue if the primary is restarted with larger
|
|
||||||
settings, while the replica is running.
|
|
||||||
|
|
||||||
|
|
||||||
### Interaction with Pageserver GC
|
|
||||||
|
|
||||||
The read replica can lag behind the primary. If there are recovery
|
|
||||||
conflicts or the replica cannot keep up for some reason, the lag can
|
|
||||||
in principle grow indefinitely. The replica will issue all GetPage
|
|
||||||
requests to the pageservers at the current replay LSN, and needs to
|
|
||||||
see the old page versions.
|
|
||||||
|
|
||||||
If the retention period in the pageserver is set to be small, it may
|
|
||||||
have already garbage collected away the old page versions. That will
|
|
||||||
cause read errors in the compute, and can mean that the replica cannot
|
|
||||||
make progress with the replication anymore.
|
|
||||||
|
|
||||||
There is a mechanism for replica to pass information about its replay
|
|
||||||
LSN to the pageserver, so that the pageserver refrains from GC'ing
|
|
||||||
data that is still needed by the standby. It's called
|
|
||||||
'standby_horizon' in the pageserver code, see
|
|
||||||
https://github.com/neondatabase/neon/pull/7368. A separate "lease"
|
|
||||||
mechanism also is in the works, where the replica could hold a lease
|
|
||||||
on the old LSN, preventing the pageserver from advancing the GC
|
|
||||||
horizon past that point. The difference is that the standby_horizon
|
|
||||||
mechanism relies on a feedback message from replica to safekeeper,
|
|
||||||
while the lease API is exposed directly from the pageserver. A static
|
|
||||||
read-only node is not connected to safekeepers, so it cannot use the
|
|
||||||
standby_horizon mechanism.
|
|
||||||
|
|
||||||
|
|
||||||
### Synchronous replication
|
|
||||||
|
|
||||||
We haven't put any effort into synchronous replication yet.
|
|
||||||
|
|
||||||
PostgreSQL provides multiple levels of synchronicity. In the weaker
|
|
||||||
levels, a transaction is not acknowledged as committed to the client
|
|
||||||
in the primary until the WAL has been streamed to a replica or flushed
|
|
||||||
to disk there. Those modes don't make sense in Neon, because the
|
|
||||||
safekeepers handle durability.
|
|
||||||
|
|
||||||
`synchronous_commit=remote_apply` mode would make sense. In that mode,
|
|
||||||
the commit is not acknowledged to the client until it has been
|
|
||||||
replayed in the replica. That ensures that after commit, you can see
|
|
||||||
the commit in the replica too (aka. read-your-write consistency).
|
|
||||||
@@ -1,259 +0,0 @@
|
|||||||
# Rolling Storage Controller Restarts
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
This RFC describes the issues around the current storage controller restart procedure
|
|
||||||
and describes an implementation which reduces downtime to a few milliseconds on the happy path.
|
|
||||||
|
|
||||||
## Motivation
|
|
||||||
|
|
||||||
Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
|
|
||||||
While the storage controller does not sit on the main data path, it's generally not acceptable
|
|
||||||
to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
|
|
||||||
|
|
||||||
### Current Implementation
|
|
||||||
|
|
||||||
The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
|
|
||||||
In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
|
|
||||||
a new instance is created.
|
|
||||||
|
|
||||||
At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
|
|
||||||
latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
|
|
||||||
under unfavourable circumstances: pageservers are heavily loaded or unavailable.
|
|
||||||
|
|
||||||
## Prior Art
|
|
||||||
|
|
||||||
There are probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
|
|
||||||
* Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them.
|
|
||||||
For fail-over, traffic is routed to one of the standbys (which becomes active).
|
|
||||||
* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
|
|
||||||
and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
* Reduce storage controller unavailability during upgrades to milliseconds
|
|
||||||
* Minimize the interval in which it's possible for more than one storage controller
|
|
||||||
to issue reconciles.
|
|
||||||
* Have one uniform implementation for restarts and upgrades
|
|
||||||
* Fit in with the current Kubernetes deployment scheme
|
|
||||||
|
|
||||||
## Non Goals
|
|
||||||
|
|
||||||
* Implement our own consensus algorithm from scratch
|
|
||||||
* Completely eliminate storage controller downtime. Instead we aim to reduce it to the point where it looks
|
|
||||||
like a transient error to the control plane
|
|
||||||
|
|
||||||
## Impacted Components
|
|
||||||
|
|
||||||
* storage controller
|
|
||||||
* deployment orchestration (i.e. Ansible)
|
|
||||||
* helm charts
|
|
||||||
|
|
||||||
## Terminology
|
|
||||||
|
|
||||||
* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
|
|
||||||
at start-up by querying pageservers
|
|
||||||
* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
|
|
||||||
a set of replicas
|
|
||||||
|
|
||||||
## Implementation
|
|
||||||
|
|
||||||
### High Level Flow
|
|
||||||
|
|
||||||
At a very high level the proposed idea is to start a new storage controller instance while
|
|
||||||
the previous one is still running and cut-over to it when it becomes ready. The new instance,
|
|
||||||
should coordinate with the existing one and transition responsibility gracefully. While the controller
|
|
||||||
has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
|
|
||||||
scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
|
|
||||||
were operating at the same time and require operator intervention to remedy.
|
|
||||||
|
|
||||||
### Kubernetes Deployment Configuration
|
|
||||||
|
|
||||||
On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
|
|
||||||
to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.rollingUpdate.maxUnavailable=0`.
|
|
||||||
Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
|
|
||||||
scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
|
|
||||||
|
|
||||||
The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
|
|
||||||
|
|
||||||
### Storage Controller Start-Up
|
|
||||||
|
|
||||||
This section describes the primitives required on the storage controller side and the flow of the happy path.
|
|
||||||
|
|
||||||
#### Database Table For Leader Synchronization
|
|
||||||
|
|
||||||
A new table should be added to the storage controller database for leader synchronization during startup.
|
|
||||||
This table will always contain at most one row. The proposed name for the table is `leader` and the schema
|
|
||||||
contains two elements:
|
|
||||||
* `hostname`: represents the hostname for the current storage controller leader - should be addressable
|
|
||||||
from other pods in the deployment
|
|
||||||
* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
|
|
||||||
for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
|
|
||||||
|
|
||||||
Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
|
|
||||||
at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
|
|
||||||
situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
|
|
||||||
level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
|
|
||||||
READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
|
|
||||||
the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
|
|
||||||
our needs here.
|
|
||||||
|
|
||||||
```
|
|
||||||
START TRANSACTION ISOLATION LEVEL REPEATABLE READ
|
|
||||||
UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
|
|
||||||
WHERE hostname=<old_hostname> AND start_timestamp=<old_start_ts>;
|
|
||||||
```
|
|
||||||
|
|
||||||
If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
|
|
||||||
|
|
||||||
#### Step Down API
|
|
||||||
|
|
||||||
A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
|
|
||||||
request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
|
|
||||||
and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
|
|
||||||
snapshot of the observed state.
|
|
||||||
|
|
||||||
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
|
|
||||||
for failure scenario handling - see [Handling Failures](#handling-failures)).
|
|
||||||
|
|
||||||
#### Graceful Restart Happy Path
|
|
||||||
|
|
||||||
At start-up, the first thing the storage controller does is retrieve the sole row from the new
|
|
||||||
`leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader.
|
|
||||||
This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
|
|
||||||
observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
|
|
||||||
pageservers in order to build up the observed state.
|
|
||||||
|
|
||||||
Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization)
|
|
||||||
section. If this step fails, the storage controller process exits.
|
|
||||||
|
|
||||||
Note that no row will exist in the `leader` table for the first graceful restart. In that case, force update the `leader` table
|
|
||||||
(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
|
|
||||||
|
|
||||||
Summary of proposed new start-up sequence:
|
|
||||||
1. Call `/step_down`
|
|
||||||
2. Perform any pending database migrations
|
|
||||||
3. Load state from database
|
|
||||||
4. Load observed state returned in step (1) into memory
|
|
||||||
5. Do initial heartbeat round (may be moved after 5)
|
|
||||||
7. Mark self as leader by updating the database
|
|
||||||
8. Reschedule and reconcile everything
|
|
||||||
|
|
||||||
Some things to note from the steps above:
|
|
||||||
* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
|
|
||||||
calls to the pageserver and no compute notifications)
|
|
||||||
* Ask the current leader to step down before loading state from database so we don't get a lost update
|
|
||||||
if the transactions overlap.
|
|
||||||
* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
|
|
||||||
fall back to asking the pageservers about their current locations.
|
|
||||||
* Database migrations should only run **after** the previous instance steps down (or the step down times out).
|
|
||||||
|
|
||||||
|
|
||||||
[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
|
|
||||||
so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
|
|
||||||
|
|
||||||
### Handling Failures
|
|
||||||
|
|
||||||
#### Storage Controller Crash Or Restart
|
|
||||||
|
|
||||||
The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
|
|
||||||
`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
|
|
||||||
start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
|
|
||||||
exits and consistency is maintained.
|
|
||||||
|
|
||||||
#### Previous Leader Crashes Before New Leader Readiness
|
|
||||||
|
|
||||||
When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will
|
|
||||||
reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
|
|
||||||
(see [2]).
|
|
||||||
|
|
||||||
Now we have two cases to consider:
|
|
||||||
* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
|
|
||||||
by Kubernetes depending on timings.
|
|
||||||
* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
|
|
||||||
The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
|
|
||||||
create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the aspiring leader.
|
|
||||||
|
|
||||||
[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
|
|
||||||
should avoid this self reference and fail the API call at the client if the persisted hostname matches
|
|
||||||
the current one.
|
|
||||||
|
|
||||||
#### Previous Leader Crashes After New Leader Readiness
|
|
||||||
|
|
||||||
The deployment's replica sets already satisfy the deployment's replica count requirements and the
|
|
||||||
Kubernetes deployment rollout will just clean up the dead pod.
|
|
||||||
|
|
||||||
#### New Leader Crashes Before Passing Readiness Check
|
|
||||||
|
|
||||||
The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated
|
|
||||||
with the new pod.
|
|
||||||
|
|
||||||
#### Network Partition Between New Pod and Previous Leader
|
|
||||||
|
|
||||||
This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
|
|
||||||
API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
|
|
||||||
Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciles.
|
|
||||||
|
|
||||||
### Dealing With Split Brain Scenarios
|
|
||||||
|
|
||||||
As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain
|
|
||||||
duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
|
|
||||||
scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
|
|
||||||
The rest of this section sketches some safety measures. It's likely overkill to implement all of them however.
|
|
||||||
|
|
||||||
### Ensure Leadership Before Producing Side Effects
|
|
||||||
|
|
||||||
The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
|
|
||||||
Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
|
|
||||||
applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases.
|
|
||||||
|
|
||||||
### Leadership Lease
|
|
||||||
|
|
||||||
Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
|
|
||||||
to be renewed periodically. Two new columns would be added to the `leader` table:
|
|
||||||
1. `last_renewed` - timestamp indicating when the lease was last renewed
|
|
||||||
2. `lease_duration` - duration indicating the amount of time after which the lease expires
|
|
||||||
|
|
||||||
The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
|
|
||||||
same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
|
|
||||||
to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request.
|
|
||||||
|
|
||||||
### Notify Pageserver Of Storage Controller Term
|
|
||||||
|
|
||||||
Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
|
|
||||||
Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
|
|
||||||
anything which contains a stale term (i.e. smaller than the current one).
|
|
||||||
|
|
||||||
### Observability
|
|
||||||
|
|
||||||
* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`).
|
|
||||||
Per region alerts should be added on this metric which triggers when:
|
|
||||||
+ no storage controller has been in the `Active` state for an extended period of time
|
|
||||||
+ more than one storage controller is in the `Active` state
|
|
||||||
|
|
||||||
* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
|
|
||||||
We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
|
|
||||||
|
|
||||||
## Alternatives
|
|
||||||
|
|
||||||
### Kubernetes Leases
|
|
||||||
|
|
||||||
Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
|
|
||||||
Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
|
|
||||||
|
|
||||||
In our case, it would work something like this:
|
|
||||||
* `/step_down` deletes the lease or stops it from renewing
|
|
||||||
* lease acquisition becomes part of the start-up procedure
|
|
||||||
|
|
||||||
The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
|
|
||||||
not exactly trivial to implement.
|
|
||||||
|
|
||||||
This approach has the benefit of baked in observability (`kubectl describe lease`), but:
|
|
||||||
* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
|
|
||||||
* More code surface than the simple "row in database" approach. Also, most of this code would be in
|
|
||||||
a dependency not subject to code review, etc.
|
|
||||||
* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it to do
|
|
||||||
so is not simple and complicates the test set-up.
|
|
||||||
|
|
||||||
To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
|
|
||||||
to something external.
|
|
||||||
@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
|
|||||||
1. Create a new branch based on the stable branch you are updating.
|
1. Create a new branch based on the stable branch you are updating.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git checkout -b my-branch-15 REL_15_STABLE_neon
|
git checkout -b my-branch REL_15_STABLE_neon
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
|
1. Tag the last commit on the stable branch you are updating.
|
||||||
|
|
||||||
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
|
```shell
|
||||||
|
git tag REL_15_3_neon
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Push the new tag to the Neon Postgres repository.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
git push origin REL_15_3_neon
|
||||||
|
```
|
||||||
|
|
||||||
|
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||||
|
|
||||||
|
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git fetch upstream REL_15_4
|
git fetch upstream REL_15_4
|
||||||
git merge REL_15_4
|
git rebase REL_15_4
|
||||||
```
|
```
|
||||||
|
|
||||||
In the commit message of the merge commit, mention if there were
|
|
||||||
any non-trivial conflicts or other issues.
|
|
||||||
|
|
||||||
1. Run the Postgres test suite to make sure our commits have not affected
|
1. Run the Postgres test suite to make sure our commits have not affected
|
||||||
Postgres in a negative way.
|
Postgres in a negative way.
|
||||||
|
|
||||||
@@ -48,7 +57,7 @@ Postgres in a negative way.
|
|||||||
1. Push your branch to the Neon Postgres repository.
|
1. Push your branch to the Neon Postgres repository.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git push origin my-branch-15
|
git push origin my-branch
|
||||||
```
|
```
|
||||||
|
|
||||||
1. Clone the Neon repository if you have not done so already.
|
1. Clone the Neon repository if you have not done so already.
|
||||||
@@ -65,7 +74,7 @@ branch.
|
|||||||
1. Update the Git submodule.
|
1. Update the Git submodule.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
|
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||||
git submodule update --remote vendor/postgres-v15
|
git submodule update --remote vendor/postgres-v15
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -80,12 +89,14 @@ minor Postgres release.
|
|||||||
|
|
||||||
1. Create a pull request, and wait for CI to go green.
|
1. Create a pull request, and wait for CI to go green.
|
||||||
|
|
||||||
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
|
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
git push origin my-branch-15:REL_15_STABLE_neon
|
git push --force origin my-branch:REL_15_STABLE_neon
|
||||||
```
|
```
|
||||||
|
|
||||||
|
It may require disabling various branch protections.
|
||||||
|
|
||||||
1. Update your Neon PR to point at the branches.
|
1. Update your Neon PR to point at the branches.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
|||||||
@@ -14,3 +14,5 @@ regex.workspace = true
|
|||||||
|
|
||||||
utils = { path = "../utils" }
|
utils = { path = "../utils" }
|
||||||
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -6,8 +6,10 @@ license = "Apache-2.0"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
chrono = { workspace = true, features = ["serde"] }
|
chrono.workspace = true
|
||||||
rand.workspace = true
|
rand.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_with.workspace = true
|
serde_with.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -14,3 +14,5 @@ parking_lot.workspace = true
|
|||||||
hex.workspace = true
|
hex.workspace = true
|
||||||
scopeguard.workspace = true
|
scopeguard.workspace = true
|
||||||
smallvec = { workspace = true, features = ["write"] }
|
smallvec = { workspace = true, features = ["write"] }
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ chrono.workspace = true
|
|||||||
twox-hash.workspace = true
|
twox-hash.workspace = true
|
||||||
measured.workspace = true
|
measured.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[target.'cfg(target_os = "linux")'.dependencies]
|
[target.'cfg(target_os = "linux")'.dependencies]
|
||||||
procfs.workspace = true
|
procfs.workspace = true
|
||||||
measured-process.workspace = true
|
measured-process.workspace = true
|
||||||
|
|||||||
@@ -21,9 +21,11 @@ hex.workspace = true
|
|||||||
humantime.workspace = true
|
humantime.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
humantime-serde.workspace = true
|
humantime-serde.workspace = true
|
||||||
chrono = { workspace = true, features = ["serde"] }
|
chrono.workspace = true
|
||||||
itertools.workspace = true
|
itertools.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
rand.workspace = true
|
rand.workspace = true
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::{NodeId, TenantId};
|
use utils::id::{NodeId, TenantId};
|
||||||
|
|
||||||
use crate::models::PageserverUtilization;
|
|
||||||
use crate::{
|
use crate::{
|
||||||
models::{ShardParameters, TenantConfig},
|
models::{ShardParameters, TenantConfig},
|
||||||
shard::{ShardStripeSize, TenantShardId},
|
shard::{ShardStripeSize, TenantShardId},
|
||||||
@@ -56,8 +55,6 @@ pub struct NodeRegisterRequest {
|
|||||||
|
|
||||||
pub listen_http_addr: String,
|
pub listen_http_addr: String,
|
||||||
pub listen_http_port: u16,
|
pub listen_http_port: u16,
|
||||||
|
|
||||||
pub availability_zone_id: Option<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
#[derive(Serialize, Deserialize)]
|
||||||
@@ -143,11 +140,23 @@ pub struct TenantShardMigrateRequest {
|
|||||||
pub node_id: NodeId,
|
pub node_id: NodeId,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Clone, Debug)]
|
/// Utilisation score indicating how good a candidate a pageserver
|
||||||
|
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||||
|
/// Lower values are better.
|
||||||
|
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||||
|
pub struct UtilizationScore(pub u64);
|
||||||
|
|
||||||
|
impl UtilizationScore {
|
||||||
|
pub fn worst() -> Self {
|
||||||
|
UtilizationScore(u64::MAX)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Clone, Copy, Debug)]
|
||||||
#[serde(into = "NodeAvailabilityWrapper")]
|
#[serde(into = "NodeAvailabilityWrapper")]
|
||||||
pub enum NodeAvailability {
|
pub enum NodeAvailability {
|
||||||
// Normal, happy state
|
// Normal, happy state
|
||||||
Active(PageserverUtilization),
|
Active(UtilizationScore),
|
||||||
// Node is warming up, but we expect it to become available soon. Covers
|
// Node is warming up, but we expect it to become available soon. Covers
|
||||||
// the time span between the re-attach response being composed on the storage controller
|
// the time span between the re-attach response being composed on the storage controller
|
||||||
// and the first successful heartbeat after the processing of the re-attach response
|
// and the first successful heartbeat after the processing of the re-attach response
|
||||||
@@ -186,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
|||||||
match val {
|
match val {
|
||||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||||
// the heartbeats.
|
// the heartbeats.
|
||||||
NodeAvailabilityWrapper::Active => {
|
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||||
NodeAvailability::Active(PageserverUtilization::full())
|
|
||||||
}
|
|
||||||
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
||||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||||
}
|
}
|
||||||
@@ -306,17 +313,20 @@ pub struct MetadataHealthUpdateRequest {
|
|||||||
pub struct MetadataHealthUpdateResponse {}
|
pub struct MetadataHealthUpdateResponse {}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
|
||||||
pub struct MetadataHealthListUnhealthyResponse {
|
pub struct MetadataHealthListUnhealthyResponse {
|
||||||
pub unhealthy_tenant_shards: Vec<TenantShardId>,
|
pub unhealthy_tenant_shards: Vec<TenantShardId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
|
||||||
pub struct MetadataHealthListOutdatedRequest {
|
pub struct MetadataHealthListOutdatedRequest {
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
pub not_scrubbed_for: Duration,
|
pub not_scrubbed_for: Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
|
|
||||||
pub struct MetadataHealthListOutdatedResponse {
|
pub struct MetadataHealthListOutdatedResponse {
|
||||||
pub health_records: Vec<MetadataHealthRecord>,
|
pub health_records: Vec<MetadataHealthRecord>,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,11 +22,6 @@ pub struct Key {
|
|||||||
pub field6: u32,
|
pub field6: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
|
|
||||||
/// a struct of fields.
|
|
||||||
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd)]
|
|
||||||
pub struct CompactKey(i128);
|
|
||||||
|
|
||||||
/// The storage key size.
|
/// The storage key size.
|
||||||
pub const KEY_SIZE: usize = 18;
|
pub const KEY_SIZE: usize = 18;
|
||||||
|
|
||||||
@@ -108,41 +103,14 @@ impl Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This function checks more extensively what keys we can take on the write path.
|
|
||||||
/// If a key beginning with 00 does not have a global/default tablespace OID, it
|
|
||||||
/// will be rejected on the write path.
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
|
|
||||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
|
||||||
if !self.is_i128_representable() {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if self.field1 == 0
|
|
||||||
&& !(self.field2 == GLOBALTABLESPACE_OID
|
|
||||||
|| self.field2 == DEFAULTTABLESPACE_OID
|
|
||||||
|| self.field2 == 0)
|
|
||||||
{
|
|
||||||
return false; // User defined tablespaces are not supported
|
|
||||||
}
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
|
|
||||||
/// checks if the key is i128 representable. Note that some keys can be successfully
|
|
||||||
/// ingested into the pageserver, but will cause errors on generating basebackup.
|
|
||||||
pub fn is_valid_key_on_write_path(&self) -> bool {
|
|
||||||
self.is_i128_representable()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn is_i128_representable(&self) -> bool {
|
|
||||||
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
|
|
||||||
}
|
|
||||||
|
|
||||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||||
pub fn to_i128(&self) -> i128 {
|
pub fn to_i128(&self) -> i128 {
|
||||||
assert!(self.is_i128_representable(), "invalid key: {self}");
|
assert!(
|
||||||
|
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
|
||||||
|
"invalid key: {self}",
|
||||||
|
);
|
||||||
(((self.field1 & 0x7F) as i128) << 120)
|
(((self.field1 & 0x7F) as i128) << 120)
|
||||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||||
| ((self.field3 as i128) << 72)
|
| ((self.field3 as i128) << 72)
|
||||||
@@ -162,14 +130,6 @@ impl Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn to_compact(&self) -> CompactKey {
|
|
||||||
CompactKey(self.to_i128())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn from_compact(k: CompactKey) -> Self {
|
|
||||||
Self::from_i128(k.0)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const fn next(&self) -> Key {
|
pub const fn next(&self) -> Key {
|
||||||
self.add(1)
|
self.add(1)
|
||||||
}
|
}
|
||||||
@@ -239,13 +199,6 @@ impl fmt::Display for Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Display for CompactKey {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
let k = Key::from_compact(*self);
|
|
||||||
k.fmt(f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Key {
|
impl Key {
|
||||||
pub const MIN: Key = Key {
|
pub const MIN: Key = Key {
|
||||||
field1: u8::MIN,
|
field1: u8::MIN,
|
||||||
@@ -263,15 +216,6 @@ impl Key {
|
|||||||
field5: u8::MAX,
|
field5: u8::MAX,
|
||||||
field6: u32::MAX,
|
field6: u32::MAX,
|
||||||
};
|
};
|
||||||
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
|
|
||||||
pub const NON_L0_MAX: Key = Key {
|
|
||||||
field1: u8::MAX,
|
|
||||||
field2: u32::MAX,
|
|
||||||
field3: u32::MAX,
|
|
||||||
field4: u32::MAX,
|
|
||||||
field5: u8::MAX,
|
|
||||||
field6: u32::MAX - 1,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn from_hex(s: &str) -> Result<Self> {
|
pub fn from_hex(s: &str) -> Result<Self> {
|
||||||
if s.len() != 36 {
|
if s.len() != 36 {
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ pub struct ShardedRange<'a> {
|
|||||||
|
|
||||||
// Calculate the size of a range within the blocks of the same relation, or spanning only the
|
// Calculate the size of a range within the blocks of the same relation, or spanning only the
|
||||||
// top page in the previous relation's space.
|
// top page in the previous relation's space.
|
||||||
pub fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||||
debug_assert!(is_contiguous_range(range));
|
debug_assert!(is_contiguous_range(range));
|
||||||
if range.start.field6 == 0xffffffff {
|
if range.start.field6 == 0xffffffff {
|
||||||
range.end.field6 + 1
|
range.end.field6 + 1
|
||||||
@@ -67,7 +67,7 @@ pub fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
|||||||
/// This matters, because:
|
/// This matters, because:
|
||||||
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
|
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
|
||||||
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
|
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
|
||||||
pub fn is_contiguous_range(range: &Range<Key>) -> bool {
|
fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||||
range.start.field1 == range.end.field1
|
range.start.field1 == range.end.field1
|
||||||
&& range.start.field2 == range.end.field2
|
&& range.start.field2 == range.end.field2
|
||||||
&& range.start.field3 == range.end.field3
|
&& range.start.field3 == range.end.field3
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ pub use utilization::PageserverUtilization;
|
|||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
io::{BufRead, Read},
|
io::{BufRead, Read},
|
||||||
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
num::{NonZeroU64, NonZeroUsize},
|
||||||
str::FromStr,
|
str::FromStr,
|
||||||
sync::atomic::AtomicUsize,
|
sync::atomic::AtomicUsize,
|
||||||
time::{Duration, SystemTime},
|
time::{Duration, SystemTime},
|
||||||
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
|
|||||||
|
|
||||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||||
pub fn default_tenant_config() -> Self {
|
pub fn default_tenant_config() -> Self {
|
||||||
Self::V2
|
Self::V1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -486,11 +486,12 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
|||||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||||
pub struct ThrottleConfig {
|
pub struct ThrottleConfig {
|
||||||
pub task_kinds: Vec<String>, // TaskKind
|
pub task_kinds: Vec<String>, // TaskKind
|
||||||
pub initial: u32,
|
pub initial: usize,
|
||||||
#[serde(with = "humantime_serde")]
|
#[serde(with = "humantime_serde")]
|
||||||
pub refill_interval: Duration,
|
pub refill_interval: Duration,
|
||||||
pub refill_amount: NonZeroU32,
|
pub refill_amount: NonZeroUsize,
|
||||||
pub max: u32,
|
pub max: usize,
|
||||||
|
pub fair: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ThrottleConfig {
|
impl ThrottleConfig {
|
||||||
@@ -500,8 +501,9 @@ impl ThrottleConfig {
|
|||||||
// other values don't matter with emtpy `task_kinds`.
|
// other values don't matter with emtpy `task_kinds`.
|
||||||
initial: 0,
|
initial: 0,
|
||||||
refill_interval: Duration::from_millis(1),
|
refill_interval: Duration::from_millis(1),
|
||||||
refill_amount: NonZeroU32::new(1).unwrap(),
|
refill_amount: NonZeroUsize::new(1).unwrap(),
|
||||||
max: 1,
|
max: 1,
|
||||||
|
fair: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/// The requests per second allowed by the given config.
|
/// The requests per second allowed by the given config.
|
||||||
@@ -716,7 +718,6 @@ pub struct TimelineInfo {
|
|||||||
pub pg_version: u32,
|
pub pg_version: u32,
|
||||||
|
|
||||||
pub state: TimelineState,
|
pub state: TimelineState,
|
||||||
pub is_archived: bool,
|
|
||||||
|
|
||||||
pub walreceiver_status: String,
|
pub walreceiver_status: String,
|
||||||
|
|
||||||
@@ -1061,7 +1062,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// A GetPage request contains two LSN values:
|
// In the V2 protocol version, a GetPage request contains two LSN values:
|
||||||
//
|
//
|
||||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||||
@@ -1074,7 +1075,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||||
// request without waiting for 'request_lsn' to arrive.
|
// request without waiting for 'request_lsn' to arrive.
|
||||||
//
|
//
|
||||||
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||||
// standby to request a page at a particular non-latest LSN, and also include the
|
// standby to request a page at a particular non-latest LSN, and also include the
|
||||||
@@ -1082,11 +1083,15 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
|||||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
|
||||||
// difference in the responses between V1 and V2.
|
// difference in the responses between V1 and V2.
|
||||||
//
|
//
|
||||||
|
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
|
||||||
|
// maps the old format requests to the new format.
|
||||||
|
//
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
pub enum PagestreamProtocolVersion {
|
pub enum PagestreamProtocolVersion {
|
||||||
|
V1,
|
||||||
V2,
|
V2,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1225,17 +1230,36 @@ impl PagestreamFeMessage {
|
|||||||
bytes.into()
|
bytes.into()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
pub fn parse<R: std::io::Read>(
|
||||||
|
body: &mut R,
|
||||||
|
protocol_version: PagestreamProtocolVersion,
|
||||||
|
) -> anyhow::Result<PagestreamFeMessage> {
|
||||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||||
//
|
//
|
||||||
// TODO: consider using protobuf or serde bincode for less error prone
|
// TODO: consider using protobuf or serde bincode for less error prone
|
||||||
// serialization.
|
// serialization.
|
||||||
let msg_tag = body.read_u8()?;
|
let msg_tag = body.read_u8()?;
|
||||||
|
|
||||||
// these two fields are the same for every request type
|
let (request_lsn, not_modified_since) = match protocol_version {
|
||||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
PagestreamProtocolVersion::V2 => (
|
||||||
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
|
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||||
|
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||||
|
),
|
||||||
|
PagestreamProtocolVersion::V1 => {
|
||||||
|
// In the old protocol, each message starts with a boolean 'latest' flag,
|
||||||
|
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
|
||||||
|
// 'not_modified_since', used in the new protocol version.
|
||||||
|
let latest = body.read_u8()? != 0;
|
||||||
|
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||||
|
if latest {
|
||||||
|
(Lsn::MAX, request_lsn) // get latest version
|
||||||
|
} else {
|
||||||
|
(request_lsn, request_lsn) // get version at specified LSN
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// The rest of the messages are the same between V1 and V2
|
||||||
match msg_tag {
|
match msg_tag {
|
||||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||||
request_lsn,
|
request_lsn,
|
||||||
@@ -1443,7 +1467,9 @@ mod tests {
|
|||||||
];
|
];
|
||||||
for msg in messages {
|
for msg in messages {
|
||||||
let bytes = msg.serialize();
|
let bytes = msg.serialize();
|
||||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
let reconstructed =
|
||||||
|
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
|
||||||
|
.unwrap();
|
||||||
assert!(msg == reconstructed);
|
assert!(msg == reconstructed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use std::time::SystemTime;
|
use utils::serde_system_time::SystemTime;
|
||||||
use utils::{serde_percent::Percent, serde_system_time};
|
|
||||||
|
|
||||||
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
|
||||||
/// the next tenant.
|
/// the next tenant.
|
||||||
@@ -10,143 +9,19 @@ use utils::{serde_percent::Percent, serde_system_time};
|
|||||||
/// not handle full u64 values properly.
|
/// not handle full u64 values properly.
|
||||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
||||||
pub struct PageserverUtilization {
|
pub struct PageserverUtilization {
|
||||||
/// Used disk space (physical, ground truth from statfs())
|
/// Used disk space
|
||||||
#[serde(serialize_with = "ser_saturating_u63")]
|
#[serde(serialize_with = "ser_saturating_u63")]
|
||||||
pub disk_usage_bytes: u64,
|
pub disk_usage_bytes: u64,
|
||||||
/// Free disk space
|
/// Free disk space
|
||||||
#[serde(serialize_with = "ser_saturating_u63")]
|
#[serde(serialize_with = "ser_saturating_u63")]
|
||||||
pub free_space_bytes: u64,
|
pub free_space_bytes: u64,
|
||||||
|
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
|
||||||
/// Wanted disk space, based on the tenant shards currently present on this pageserver: this
|
#[serde(serialize_with = "ser_saturating_u63")]
|
||||||
/// is like disk_usage_bytes, but it is stable and does not change with the cache state of
|
pub utilization_score: u64,
|
||||||
/// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
|
|
||||||
/// there, or may be unrealistically low if the pageserver has attached tenants which haven't
|
|
||||||
/// downloaded layers yet.
|
|
||||||
#[serde(serialize_with = "ser_saturating_u63", default)]
|
|
||||||
pub disk_wanted_bytes: u64,
|
|
||||||
|
|
||||||
// What proportion of total disk space will this pageserver use before it starts evicting data?
|
|
||||||
#[serde(default = "unity_percent")]
|
|
||||||
pub disk_usable_pct: Percent,
|
|
||||||
|
|
||||||
// How many shards are currently on this node?
|
|
||||||
#[serde(default)]
|
|
||||||
pub shard_count: u32,
|
|
||||||
|
|
||||||
// How many shards should this node be able to handle at most?
|
|
||||||
#[serde(default)]
|
|
||||||
pub max_shard_count: u32,
|
|
||||||
|
|
||||||
/// Cached result of [`Self::score`]
|
|
||||||
pub utilization_score: Option<u64>,
|
|
||||||
|
|
||||||
/// When was this snapshot captured, pageserver local time.
|
/// When was this snapshot captured, pageserver local time.
|
||||||
///
|
///
|
||||||
/// Use millis to give confidence that the value is regenerated often enough.
|
/// Use millis to give confidence that the value is regenerated often enough.
|
||||||
pub captured_at: serde_system_time::SystemTime,
|
pub captured_at: SystemTime,
|
||||||
}
|
|
||||||
|
|
||||||
fn unity_percent() -> Percent {
|
|
||||||
Percent::new(0).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub type RawScore = u64;
|
|
||||||
|
|
||||||
impl PageserverUtilization {
|
|
||||||
const UTILIZATION_FULL: u64 = 1000000;
|
|
||||||
|
|
||||||
/// Calculate a utilization score. The result is to be inrepreted as a fraction of
|
|
||||||
/// Self::UTILIZATION_FULL.
|
|
||||||
///
|
|
||||||
/// Lower values are more affine to scheduling more work on this node.
|
|
||||||
/// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
|
|
||||||
/// - 0.0 represents an empty node.
|
|
||||||
/// - Negative values are forbidden
|
|
||||||
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
|
|
||||||
/// layer eviction.
|
|
||||||
pub fn score(&self) -> RawScore {
|
|
||||||
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
|
|
||||||
* self.disk_usable_pct.get() as u64)
|
|
||||||
/ 100;
|
|
||||||
let disk_utilization_score =
|
|
||||||
self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
|
|
||||||
|
|
||||||
let shard_utilization_score =
|
|
||||||
self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
|
|
||||||
std::cmp::max(disk_utilization_score, shard_utilization_score)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn cached_score(&mut self) -> RawScore {
|
|
||||||
match self.utilization_score {
|
|
||||||
None => {
|
|
||||||
let s = self.score();
|
|
||||||
self.utilization_score = Some(s);
|
|
||||||
s
|
|
||||||
}
|
|
||||||
Some(s) => s,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
|
|
||||||
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
|
|
||||||
pub fn is_overloaded(score: RawScore) -> bool {
|
|
||||||
score >= Self::UTILIZATION_FULL
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
|
|
||||||
if self.shard_count < shard_count {
|
|
||||||
self.shard_count = shard_count;
|
|
||||||
|
|
||||||
// Dirty cache: this will be calculated next time someone retrives the score
|
|
||||||
self.utilization_score = None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A utilization structure that has a full utilization score: use this as a placeholder when
|
|
||||||
/// you need a utilization but don't have real values yet.
|
|
||||||
pub fn full() -> Self {
|
|
||||||
Self {
|
|
||||||
disk_usage_bytes: 1,
|
|
||||||
free_space_bytes: 0,
|
|
||||||
disk_wanted_bytes: 1,
|
|
||||||
disk_usable_pct: Percent::new(100).unwrap(),
|
|
||||||
shard_count: 1,
|
|
||||||
max_shard_count: 1,
|
|
||||||
utilization_score: Some(Self::UTILIZATION_FULL),
|
|
||||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Test helper
|
|
||||||
pub mod test_utilization {
|
|
||||||
use super::PageserverUtilization;
|
|
||||||
use std::time::SystemTime;
|
|
||||||
use utils::{
|
|
||||||
serde_percent::Percent,
|
|
||||||
serde_system_time::{self},
|
|
||||||
};
|
|
||||||
|
|
||||||
// Parameters of the imaginary node used for test utilization instances
|
|
||||||
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
|
|
||||||
const TEST_SHARDS_MAX: u32 = 1000;
|
|
||||||
|
|
||||||
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
|
|
||||||
/// not abuse this function from non-test code.
|
|
||||||
///
|
|
||||||
/// Emulates a node with a 1000 shard limit and a 1TB disk.
|
|
||||||
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
|
|
||||||
PageserverUtilization {
|
|
||||||
disk_usage_bytes: disk_wanted_bytes,
|
|
||||||
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
|
|
||||||
disk_wanted_bytes,
|
|
||||||
disk_usable_pct: Percent::new(100).unwrap(),
|
|
||||||
shard_count,
|
|
||||||
max_shard_count: TEST_SHARDS_MAX,
|
|
||||||
utilization_score: None,
|
|
||||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||||
@@ -174,19 +49,15 @@ mod tests {
|
|||||||
let doc = PageserverUtilization {
|
let doc = PageserverUtilization {
|
||||||
disk_usage_bytes: u64::MAX,
|
disk_usage_bytes: u64::MAX,
|
||||||
free_space_bytes: 0,
|
free_space_bytes: 0,
|
||||||
disk_wanted_bytes: u64::MAX,
|
utilization_score: u64::MAX,
|
||||||
utilization_score: Some(13),
|
captured_at: SystemTime(
|
||||||
disk_usable_pct: Percent::new(90).unwrap(),
|
|
||||||
shard_count: 100,
|
|
||||||
max_shard_count: 200,
|
|
||||||
captured_at: serde_system_time::SystemTime(
|
|
||||||
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||||
),
|
),
|
||||||
};
|
};
|
||||||
|
|
||||||
let s = serde_json::to_string(&doc).unwrap();
|
let s = serde_json::to_string(&doc).unwrap();
|
||||||
|
|
||||||
let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";
|
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
|
||||||
|
|
||||||
assert_eq!(s, expected);
|
assert_eq!(s, expected);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ tokio-rustls.workspace = true
|
|||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
|
|
||||||
pq_proto.workspace = true
|
pq_proto.workspace = true
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
|
|||||||
@@ -11,5 +11,7 @@ postgres.workspace = true
|
|||||||
tokio-postgres.workspace = true
|
tokio-postgres.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ thiserror.workspace = true
|
|||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
postgres.workspace = true
|
postgres.workspace = true
|
||||||
|
|||||||
@@ -136,15 +136,15 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
|||||||
|
|
||||||
// Export some version independent functions that are used outside of this mod
|
// Export some version independent functions that are used outside of this mod
|
||||||
pub use v14::xlog_utils::encode_logical_message;
|
pub use v14::xlog_utils::encode_logical_message;
|
||||||
|
pub use v14::xlog_utils::from_pg_timestamp;
|
||||||
pub use v14::xlog_utils::get_current_timestamp;
|
pub use v14::xlog_utils::get_current_timestamp;
|
||||||
pub use v14::xlog_utils::to_pg_timestamp;
|
pub use v14::xlog_utils::to_pg_timestamp;
|
||||||
pub use v14::xlog_utils::try_from_pg_timestamp;
|
|
||||||
pub use v14::xlog_utils::XLogFileName;
|
pub use v14::xlog_utils::XLogFileName;
|
||||||
|
|
||||||
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||||
|
|
||||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
|
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
||||||
dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
|
dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn generate_wal_segment(
|
pub fn generate_wal_segment(
|
||||||
|
|||||||
@@ -135,8 +135,6 @@ pub fn get_current_timestamp() -> TimestampTz {
|
|||||||
mod timestamp_conversions {
|
mod timestamp_conversions {
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::Context;
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
||||||
@@ -156,18 +154,18 @@ mod timestamp_conversions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
|
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
|
||||||
let time: u64 = time
|
let time: u64 = time
|
||||||
.try_into()
|
.try_into()
|
||||||
.context("timestamp before millenium (postgres epoch)")?;
|
.expect("timestamp before millenium (postgres epoch)");
|
||||||
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
||||||
SystemTime::UNIX_EPOCH
|
SystemTime::UNIX_EPOCH
|
||||||
.checked_add(Duration::from_micros(since_unix_epoch))
|
.checked_add(Duration::from_micros(since_unix_epoch))
|
||||||
.context("SystemTime overflow")
|
.expect("SystemTime overflow")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
|
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
|
||||||
|
|
||||||
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
||||||
// start_lsn must point to some previously known record boundary (beginning of
|
// start_lsn must point to some previously known record boundary (beginning of
|
||||||
@@ -547,14 +545,14 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_ts_conversion() {
|
fn test_ts_conversion() {
|
||||||
let now = SystemTime::now();
|
let now = SystemTime::now();
|
||||||
let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
|
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
|
||||||
|
|
||||||
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||||
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||||
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
||||||
|
|
||||||
let now_pg = get_current_timestamp();
|
let now_pg = get_current_timestamp();
|
||||||
let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
|
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
|
||||||
|
|
||||||
assert_eq!(now_pg, round_trip_pg);
|
assert_eq!(now_pg, round_trip_pg);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,6 +14,8 @@ postgres.workspace = true
|
|||||||
postgres_ffi.workspace = true
|
postgres_ffi.workspace = true
|
||||||
camino-tempfile.workspace = true
|
camino-tempfile.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
|
|||||||
@@ -11,7 +11,9 @@ itertools.workspace = true
|
|||||||
pin-project-lite.workspace = true
|
pin-project-lite.workspace = true
|
||||||
postgres-protocol.workspace = true
|
postgres-protocol.workspace = true
|
||||||
rand.workspace = true
|
rand.workspace = true
|
||||||
tokio = { workspace = true, features = ["io-util"] }
|
tokio.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ scopeguard.workspace = true
|
|||||||
metrics.workspace = true
|
metrics.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
pin-project-lite.workspace = true
|
pin-project-lite.workspace = true
|
||||||
|
workspace_hack.workspace = true
|
||||||
azure_core.workspace = true
|
azure_core.workspace = true
|
||||||
azure_identity.workspace = true
|
azure_identity.workspace = true
|
||||||
azure_storage.workspace = true
|
azure_storage.workspace = true
|
||||||
@@ -46,4 +46,3 @@ sync_wrapper = { workspace = true, features = ["futures"] }
|
|||||||
camino-tempfile.workspace = true
|
camino-tempfile.workspace = true
|
||||||
test-context.workspace = true
|
test-context.workspace = true
|
||||||
rand.workspace = true
|
rand.workspace = true
|
||||||
tokio = { workspace = true, features = ["test-util"] }
|
|
||||||
|
|||||||
@@ -383,48 +383,6 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn head_object(
|
|
||||||
&self,
|
|
||||||
key: &RemotePath,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<ListingObject, DownloadError> {
|
|
||||||
let kind = RequestKind::Head;
|
|
||||||
let _permit = self.permit(kind, cancel).await?;
|
|
||||||
|
|
||||||
let started_at = start_measuring_requests(kind);
|
|
||||||
|
|
||||||
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
|
|
||||||
let properties_future = blob_client.get_properties().into_future();
|
|
||||||
|
|
||||||
let properties_future = tokio::time::timeout(self.timeout, properties_future);
|
|
||||||
|
|
||||||
let res = tokio::select! {
|
|
||||||
res = properties_future => res,
|
|
||||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Ok(inner) = &res {
|
|
||||||
// do not incl. timeouts as errors in metrics but cancellations
|
|
||||||
let started_at = ScopeGuard::into_inner(started_at);
|
|
||||||
crate::metrics::BUCKET_METRICS
|
|
||||||
.req_seconds
|
|
||||||
.observe_elapsed(kind, inner, started_at);
|
|
||||||
}
|
|
||||||
|
|
||||||
let data = match res {
|
|
||||||
Ok(Ok(data)) => Ok(data),
|
|
||||||
Ok(Err(sdk)) => Err(to_download_error(sdk)),
|
|
||||||
Err(_timeout) => Err(DownloadError::Timeout),
|
|
||||||
}?;
|
|
||||||
|
|
||||||
let properties = data.blob.properties;
|
|
||||||
Ok(ListingObject {
|
|
||||||
key: key.to_owned(),
|
|
||||||
last_modified: SystemTime::from(properties.last_modified),
|
|
||||||
size: properties.content_length,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn upload(
|
async fn upload(
|
||||||
&self,
|
&self,
|
||||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||||
|
|||||||
@@ -42,10 +42,6 @@ impl DownloadError {
|
|||||||
Timeout | Other(_) => false,
|
Timeout | Other(_) => false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_cancelled(&self) -> bool {
|
|
||||||
matches!(self, DownloadError::Cancelled)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<std::io::Error> for DownloadError {
|
impl From<std::io::Error> for DownloadError {
|
||||||
|
|||||||
@@ -150,7 +150,7 @@ pub enum ListingMode {
|
|||||||
NoDelimiter,
|
NoDelimiter,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(PartialEq, Eq, Debug, Clone)]
|
#[derive(PartialEq, Eq, Debug)]
|
||||||
pub struct ListingObject {
|
pub struct ListingObject {
|
||||||
pub key: RemotePath,
|
pub key: RemotePath,
|
||||||
pub last_modified: SystemTime,
|
pub last_modified: SystemTime,
|
||||||
@@ -215,13 +215,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
|||||||
Ok(combined)
|
Ok(combined)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Obtain metadata information about an object.
|
|
||||||
async fn head_object(
|
|
||||||
&self,
|
|
||||||
key: &RemotePath,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<ListingObject, DownloadError>;
|
|
||||||
|
|
||||||
/// Streams the local file contents into remote into the remote storage entry.
|
/// Streams the local file contents into remote into the remote storage entry.
|
||||||
///
|
///
|
||||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||||
@@ -370,20 +363,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// See [`RemoteStorage::head_object`].
|
|
||||||
pub async fn head_object(
|
|
||||||
&self,
|
|
||||||
key: &RemotePath,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<ListingObject, DownloadError> {
|
|
||||||
match self {
|
|
||||||
Self::LocalFs(s) => s.head_object(key, cancel).await,
|
|
||||||
Self::AwsS3(s) => s.head_object(key, cancel).await,
|
|
||||||
Self::AzureBlob(s) => s.head_object(key, cancel).await,
|
|
||||||
Self::Unreliable(s) => s.head_object(key, cancel).await,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// See [`RemoteStorage::upload`]
|
/// See [`RemoteStorage::upload`]
|
||||||
pub async fn upload(
|
pub async fn upload(
|
||||||
&self,
|
&self,
|
||||||
@@ -619,7 +598,6 @@ impl ConcurrencyLimiter {
|
|||||||
RequestKind::Delete => &self.write,
|
RequestKind::Delete => &self.write,
|
||||||
RequestKind::Copy => &self.write,
|
RequestKind::Copy => &self.write,
|
||||||
RequestKind::TimeTravel => &self.write,
|
RequestKind::TimeTravel => &self.write,
|
||||||
RequestKind::Head => &self.read,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -445,20 +445,6 @@ impl RemoteStorage for LocalFs {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn head_object(
|
|
||||||
&self,
|
|
||||||
key: &RemotePath,
|
|
||||||
_cancel: &CancellationToken,
|
|
||||||
) -> Result<ListingObject, DownloadError> {
|
|
||||||
let target_file_path = key.with_base(&self.storage_root);
|
|
||||||
let metadata = file_metadata(&target_file_path).await?;
|
|
||||||
Ok(ListingObject {
|
|
||||||
key: key.clone(),
|
|
||||||
last_modified: metadata.modified()?,
|
|
||||||
size: metadata.len(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn upload(
|
async fn upload(
|
||||||
&self,
|
&self,
|
||||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ pub(crate) enum RequestKind {
|
|||||||
List = 3,
|
List = 3,
|
||||||
Copy = 4,
|
Copy = 4,
|
||||||
TimeTravel = 5,
|
TimeTravel = 5,
|
||||||
Head = 6,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
use scopeguard::ScopeGuard;
|
use scopeguard::ScopeGuard;
|
||||||
@@ -28,7 +27,6 @@ impl RequestKind {
|
|||||||
List => "list_objects",
|
List => "list_objects",
|
||||||
Copy => "copy_object",
|
Copy => "copy_object",
|
||||||
TimeTravel => "time_travel_recover",
|
TimeTravel => "time_travel_recover",
|
||||||
Head => "head_object",
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const fn as_index(&self) -> usize {
|
const fn as_index(&self) -> usize {
|
||||||
@@ -36,8 +34,7 @@ impl RequestKind {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const REQUEST_KIND_COUNT: usize = 7;
|
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||||
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
|
|
||||||
|
|
||||||
impl<C> RequestTyped<C> {
|
impl<C> RequestTyped<C> {
|
||||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||||
@@ -46,8 +43,8 @@ impl<C> RequestTyped<C> {
|
|||||||
|
|
||||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||||
use RequestKind::*;
|
use RequestKind::*;
|
||||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
|
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||||
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
|
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||||
let next = it.next().unwrap();
|
let next = it.next().unwrap();
|
||||||
assert_eq!(index, next.as_index());
|
assert_eq!(index, next.as_index());
|
||||||
f(next)
|
f(next)
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ use aws_config::{
|
|||||||
use aws_sdk_s3::{
|
use aws_sdk_s3::{
|
||||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||||
error::SdkError,
|
error::SdkError,
|
||||||
operation::{get_object::GetObjectError, head_object::HeadObjectError},
|
operation::get_object::GetObjectError,
|
||||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
||||||
Client,
|
Client,
|
||||||
};
|
};
|
||||||
@@ -604,78 +604,6 @@ impl RemoteStorage for S3Bucket {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn head_object(
|
|
||||||
&self,
|
|
||||||
key: &RemotePath,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<ListingObject, DownloadError> {
|
|
||||||
let kind = RequestKind::Head;
|
|
||||||
let _permit = self.permit(kind, cancel).await?;
|
|
||||||
|
|
||||||
let started_at = start_measuring_requests(kind);
|
|
||||||
|
|
||||||
let head_future = self
|
|
||||||
.client
|
|
||||||
.head_object()
|
|
||||||
.bucket(self.bucket_name())
|
|
||||||
.key(self.relative_path_to_s3_object(key))
|
|
||||||
.send();
|
|
||||||
|
|
||||||
let head_future = tokio::time::timeout(self.timeout, head_future);
|
|
||||||
|
|
||||||
let res = tokio::select! {
|
|
||||||
res = head_future => res,
|
|
||||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let res = res.map_err(|_e| DownloadError::Timeout)?;
|
|
||||||
|
|
||||||
// do not incl. timeouts as errors in metrics but cancellations
|
|
||||||
let started_at = ScopeGuard::into_inner(started_at);
|
|
||||||
crate::metrics::BUCKET_METRICS
|
|
||||||
.req_seconds
|
|
||||||
.observe_elapsed(kind, &res, started_at);
|
|
||||||
|
|
||||||
let data = match res {
|
|
||||||
Ok(object_output) => object_output,
|
|
||||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
|
|
||||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
|
||||||
// an error: we expect to sometimes fetch an object and find it missing,
|
|
||||||
// e.g. when probing for timeline indices.
|
|
||||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
|
||||||
kind,
|
|
||||||
AttemptOutcome::Ok,
|
|
||||||
started_at,
|
|
||||||
);
|
|
||||||
return Err(DownloadError::NotFound);
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
|
||||||
kind,
|
|
||||||
AttemptOutcome::Err,
|
|
||||||
started_at,
|
|
||||||
);
|
|
||||||
|
|
||||||
return Err(DownloadError::Other(
|
|
||||||
anyhow::Error::new(e).context("s3 head object"),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
|
|
||||||
return Err(DownloadError::Other(anyhow!(
|
|
||||||
"head_object doesn't contain last_modified or content_length"
|
|
||||||
)))?;
|
|
||||||
};
|
|
||||||
Ok(ListingObject {
|
|
||||||
key: key.to_owned(),
|
|
||||||
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
|
|
||||||
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
|
|
||||||
})?,
|
|
||||||
size: size as u64,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn upload(
|
async fn upload(
|
||||||
&self,
|
&self,
|
||||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||||
|
|||||||
@@ -30,7 +30,6 @@ pub struct UnreliableWrapper {
|
|||||||
#[derive(Debug, Hash, Eq, PartialEq)]
|
#[derive(Debug, Hash, Eq, PartialEq)]
|
||||||
enum RemoteOp {
|
enum RemoteOp {
|
||||||
ListPrefixes(Option<RemotePath>),
|
ListPrefixes(Option<RemotePath>),
|
||||||
HeadObject(RemotePath),
|
|
||||||
Upload(RemotePath),
|
Upload(RemotePath),
|
||||||
Download(RemotePath),
|
Download(RemotePath),
|
||||||
Delete(RemotePath),
|
Delete(RemotePath),
|
||||||
@@ -138,16 +137,6 @@ impl RemoteStorage for UnreliableWrapper {
|
|||||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn head_object(
|
|
||||||
&self,
|
|
||||||
key: &RemotePath,
|
|
||||||
cancel: &CancellationToken,
|
|
||||||
) -> Result<crate::ListingObject, DownloadError> {
|
|
||||||
self.attempt(RemoteOp::HeadObject(key.clone()))
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
self.inner.head_object(key, cancel).await
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn upload(
|
async fn upload(
|
||||||
&self,
|
&self,
|
||||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||||
|
|||||||
@@ -9,3 +9,5 @@ serde.workspace = true
|
|||||||
serde_with.workspace = true
|
serde_with.workspace = true
|
||||||
const_format.workspace = true
|
const_format.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -9,3 +9,5 @@ license.workspace = true
|
|||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -14,3 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
|||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-opentelemetry.workspace = true
|
tracing-opentelemetry.workspace = true
|
||||||
tracing-subscriber.workspace = true
|
tracing-subscriber.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ testing = ["fail/failpoints"]
|
|||||||
arc-swap.workspace = true
|
arc-swap.workspace = true
|
||||||
sentry.workspace = true
|
sentry.workspace = true
|
||||||
async-compression.workspace = true
|
async-compression.workspace = true
|
||||||
|
async-trait.workspace = true
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
bincode.workspace = true
|
bincode.workspace = true
|
||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
@@ -25,6 +26,7 @@ hyper = { workspace = true, features = ["full"] }
|
|||||||
fail.workspace = true
|
fail.workspace = true
|
||||||
futures = { workspace = true}
|
futures = { workspace = true}
|
||||||
jsonwebtoken.workspace = true
|
jsonwebtoken.workspace = true
|
||||||
|
leaky-bucket.workspace = true
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
once_cell.workspace = true
|
once_cell.workspace = true
|
||||||
pin-project-lite.workspace = true
|
pin-project-lite.workspace = true
|
||||||
@@ -37,7 +39,7 @@ thiserror.workspace = true
|
|||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-tar.workspace = true
|
tokio-tar.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
toml_edit = { workspace = true, features = ["serde"] }
|
toml_edit.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-error.workspace = true
|
tracing-error.workspace = true
|
||||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||||
@@ -52,6 +54,7 @@ walkdir.workspace = true
|
|||||||
pq_proto.workspace = true
|
pq_proto.workspace = true
|
||||||
postgres_connection.workspace = true
|
postgres_connection.workspace = true
|
||||||
metrics.workspace = true
|
metrics.workspace = true
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
const_format.workspace = true
|
const_format.workspace = true
|
||||||
|
|
||||||
@@ -68,7 +71,6 @@ criterion.workspace = true
|
|||||||
hex-literal.workspace = true
|
hex-literal.workspace = true
|
||||||
camino-tempfile.workspace = true
|
camino-tempfile.workspace = true
|
||||||
serde_assert.workspace = true
|
serde_assert.workspace = true
|
||||||
tokio = { workspace = true, features = ["test-util"] }
|
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "benchmarks"
|
name = "benchmarks"
|
||||||
|
|||||||
@@ -5,40 +5,13 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
|
|||||||
/// Can be cloned, moved and kept around in futures as "guard objects".
|
/// Can be cloned, moved and kept around in futures as "guard objects".
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Completion {
|
pub struct Completion {
|
||||||
token: TaskTrackerToken,
|
_token: TaskTrackerToken,
|
||||||
}
|
|
||||||
|
|
||||||
impl std::fmt::Debug for Completion {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
f.debug_struct("Completion")
|
|
||||||
.field("siblings", &self.token.task_tracker().len())
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Completion {
|
|
||||||
/// Returns true if this completion is associated with the given barrier.
|
|
||||||
pub fn blocks(&self, barrier: &Barrier) -> bool {
|
|
||||||
TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn barrier(&self) -> Barrier {
|
|
||||||
Barrier(self.token.task_tracker().clone())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Barrier(TaskTracker);
|
pub struct Barrier(TaskTracker);
|
||||||
|
|
||||||
impl std::fmt::Debug for Barrier {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
f.debug_struct("Barrier")
|
|
||||||
.field("remaining", &self.0.len())
|
|
||||||
.finish()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for Barrier {
|
impl Default for Barrier {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
let (_, rx) = channel();
|
let (_, rx) = channel();
|
||||||
@@ -78,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
|
|||||||
tracker.close();
|
tracker.close();
|
||||||
|
|
||||||
let token = tracker.token();
|
let token = tracker.token();
|
||||||
(Completion { token }, Barrier(tracker))
|
(Completion { _token: token }, Barrier(tracker))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,280 +0,0 @@
|
|||||||
//! This module implements the Generic Cell Rate Algorithm for a simplified
|
|
||||||
//! version of the Leaky Bucket rate limiting system.
|
|
||||||
//!
|
|
||||||
//! # Leaky Bucket
|
|
||||||
//!
|
|
||||||
//! If the bucket is full, no new requests are allowed and are throttled/errored.
|
|
||||||
//! If the bucket is partially full/empty, new requests are added to the bucket in
|
|
||||||
//! terms of "tokens".
|
|
||||||
//!
|
|
||||||
//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
|
|
||||||
//!
|
|
||||||
//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
|
|
||||||
//!
|
|
||||||
//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
|
|
||||||
//!
|
|
||||||
//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
|
|
||||||
//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
|
|
||||||
//!
|
|
||||||
//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
|
|
||||||
//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
|
|
||||||
//!
|
|
||||||
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
|
|
||||||
|
|
||||||
use std::{sync::Mutex, time::Duration};
|
|
||||||
|
|
||||||
use tokio::{sync::Notify, time::Instant};
|
|
||||||
|
|
||||||
pub struct LeakyBucketConfig {
|
|
||||||
/// This is the "time cost" of a single request unit.
|
|
||||||
/// Should loosely represent how long it takes to handle a request unit in active resource time.
|
|
||||||
/// Loosely speaking this is the inverse of the steady-rate requests-per-second
|
|
||||||
pub cost: Duration,
|
|
||||||
|
|
||||||
/// total size of the bucket
|
|
||||||
pub bucket_width: Duration,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LeakyBucketConfig {
|
|
||||||
pub fn new(rps: f64, bucket_size: f64) -> Self {
|
|
||||||
let cost = Duration::from_secs_f64(rps.recip());
|
|
||||||
let bucket_width = cost.mul_f64(bucket_size);
|
|
||||||
Self { cost, bucket_width }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct LeakyBucketState {
|
|
||||||
/// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
|
|
||||||
///
|
|
||||||
/// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
|
|
||||||
/// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
|
|
||||||
/// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
|
|
||||||
/// Draining the bucket will happen naturally as `now` moves forward.
|
|
||||||
///
|
|
||||||
/// Let `n` be some "time cost" for the request,
|
|
||||||
/// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
|
|
||||||
/// If now is within the `bucket window + n`, we are within time budget.
|
|
||||||
/// If now is before the `bucket window + n`, we have run out of budget.
|
|
||||||
///
|
|
||||||
/// This is inspired by the generic cell rate algorithm (GCRA) and works
|
|
||||||
/// exactly the same as a leaky-bucket.
|
|
||||||
pub empty_at: Instant,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LeakyBucketState {
|
|
||||||
pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
|
|
||||||
LeakyBucketState {
|
|
||||||
empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn bucket_is_empty(&self, now: Instant) -> bool {
|
|
||||||
// if self.end is after now, the bucket is not empty
|
|
||||||
self.empty_at <= now
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Immediately adds tokens to the bucket, if there is space.
|
|
||||||
///
|
|
||||||
/// In a scenario where you are waiting for available rate,
|
|
||||||
/// rather than just erroring immediately, `started` corresponds to when this waiting started.
|
|
||||||
///
|
|
||||||
/// `n` is the number of tokens that will be filled in the bucket.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
///
|
|
||||||
/// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
|
|
||||||
/// there will be space again.
|
|
||||||
pub fn add_tokens(
|
|
||||||
&mut self,
|
|
||||||
config: &LeakyBucketConfig,
|
|
||||||
started: Instant,
|
|
||||||
n: f64,
|
|
||||||
) -> Result<(), Instant> {
|
|
||||||
let now = Instant::now();
|
|
||||||
|
|
||||||
// invariant: started <= now
|
|
||||||
debug_assert!(started <= now);
|
|
||||||
|
|
||||||
// If the bucket was empty when we started our search,
|
|
||||||
// we should update the `empty_at` value accordingly.
|
|
||||||
// this prevents us from having negative tokens in the bucket.
|
|
||||||
let mut empty_at = self.empty_at;
|
|
||||||
if empty_at < started {
|
|
||||||
empty_at = started;
|
|
||||||
}
|
|
||||||
|
|
||||||
let n = config.cost.mul_f64(n);
|
|
||||||
let new_empty_at = empty_at + n;
|
|
||||||
let allow_at = new_empty_at.checked_sub(config.bucket_width);
|
|
||||||
|
|
||||||
// empty_at
|
|
||||||
// allow_at | new_empty_at
|
|
||||||
// / | /
|
|
||||||
// -------o-[---------o-|--]---------
|
|
||||||
// now1 ^ now2 ^
|
|
||||||
//
|
|
||||||
// at now1, the bucket would be completely filled if we add n tokens.
|
|
||||||
// at now2, the bucket would be partially filled if we add n tokens.
|
|
||||||
|
|
||||||
match allow_at {
|
|
||||||
Some(allow_at) if now < allow_at => Err(allow_at),
|
|
||||||
_ => {
|
|
||||||
self.empty_at = new_empty_at;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct RateLimiter {
|
|
||||||
pub config: LeakyBucketConfig,
|
|
||||||
pub state: Mutex<LeakyBucketState>,
|
|
||||||
/// a queue to provide this fair ordering.
|
|
||||||
pub queue: Notify,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Requeue<'a>(&'a Notify);
|
|
||||||
|
|
||||||
impl Drop for Requeue<'_> {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
self.0.notify_one();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl RateLimiter {
|
|
||||||
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
|
|
||||||
RateLimiter {
|
|
||||||
state: Mutex::new(LeakyBucketState::with_initial_tokens(
|
|
||||||
&config,
|
|
||||||
initial_tokens,
|
|
||||||
)),
|
|
||||||
config,
|
|
||||||
queue: {
|
|
||||||
let queue = Notify::new();
|
|
||||||
queue.notify_one();
|
|
||||||
queue
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn steady_rps(&self) -> f64 {
|
|
||||||
self.config.cost.as_secs_f64().recip()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// returns true if we did throttle
|
|
||||||
pub async fn acquire(&self, count: usize) -> bool {
|
|
||||||
let mut throttled = false;
|
|
||||||
|
|
||||||
let start = tokio::time::Instant::now();
|
|
||||||
|
|
||||||
// wait until we are the first in the queue
|
|
||||||
let mut notified = std::pin::pin!(self.queue.notified());
|
|
||||||
if !notified.as_mut().enable() {
|
|
||||||
throttled = true;
|
|
||||||
notified.await;
|
|
||||||
}
|
|
||||||
|
|
||||||
// notify the next waiter in the queue when we are done.
|
|
||||||
let _guard = Requeue(&self.queue);
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let res = self
|
|
||||||
.state
|
|
||||||
.lock()
|
|
||||||
.unwrap()
|
|
||||||
.add_tokens(&self.config, start, count as f64);
|
|
||||||
match res {
|
|
||||||
Ok(()) => return throttled,
|
|
||||||
Err(ready_at) => {
|
|
||||||
throttled = true;
|
|
||||||
tokio::time::sleep_until(ready_at).await;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
use tokio::time::Instant;
|
|
||||||
|
|
||||||
use super::{LeakyBucketConfig, LeakyBucketState};
|
|
||||||
|
|
||||||
#[tokio::test(start_paused = true)]
|
|
||||||
async fn check() {
|
|
||||||
let config = LeakyBucketConfig {
|
|
||||||
// average 100rps
|
|
||||||
cost: Duration::from_millis(10),
|
|
||||||
// burst up to 100 requests
|
|
||||||
bucket_width: Duration::from_millis(1000),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut state = LeakyBucketState {
|
|
||||||
empty_at: Instant::now(),
|
|
||||||
};
|
|
||||||
|
|
||||||
// supports burst
|
|
||||||
{
|
|
||||||
// should work for 100 requests this instant
|
|
||||||
for _ in 0..100 {
|
|
||||||
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
|
||||||
}
|
|
||||||
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
|
||||||
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
|
||||||
}
|
|
||||||
|
|
||||||
// doesn't overfill
|
|
||||||
{
|
|
||||||
// after 1s we should have an empty bucket again.
|
|
||||||
tokio::time::advance(Duration::from_secs(1)).await;
|
|
||||||
assert!(state.bucket_is_empty(Instant::now()));
|
|
||||||
|
|
||||||
// after 1s more, we should not over count the tokens and allow more than 200 requests.
|
|
||||||
tokio::time::advance(Duration::from_secs(1)).await;
|
|
||||||
for _ in 0..100 {
|
|
||||||
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
|
||||||
}
|
|
||||||
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
|
||||||
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
|
||||||
}
|
|
||||||
|
|
||||||
// supports sustained rate over a long period
|
|
||||||
{
|
|
||||||
tokio::time::advance(Duration::from_secs(1)).await;
|
|
||||||
|
|
||||||
// should sustain 100rps
|
|
||||||
for _ in 0..2000 {
|
|
||||||
tokio::time::advance(Duration::from_millis(10)).await;
|
|
||||||
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// supports requesting more tokens than can be stored in the bucket
|
|
||||||
// we just wait a little bit longer upfront.
|
|
||||||
{
|
|
||||||
// start the bucket completely empty
|
|
||||||
tokio::time::advance(Duration::from_secs(5)).await;
|
|
||||||
assert!(state.bucket_is_empty(Instant::now()));
|
|
||||||
|
|
||||||
// requesting 200 tokens of space should take 200*cost = 2s
|
|
||||||
// but we already have 1s available, so we wait 1s from start.
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
|
|
||||||
assert_eq!(ready - Instant::now(), Duration::from_secs(1));
|
|
||||||
|
|
||||||
tokio::time::advance(Duration::from_millis(500)).await;
|
|
||||||
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
|
|
||||||
assert_eq!(ready - Instant::now(), Duration::from_millis(500));
|
|
||||||
|
|
||||||
tokio::time::advance(Duration::from_millis(500)).await;
|
|
||||||
state.add_tokens(&config, start, 200.0).unwrap();
|
|
||||||
|
|
||||||
// bucket should be completely full now
|
|
||||||
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
|
||||||
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -71,7 +71,6 @@ pub mod postgres_client;
|
|||||||
|
|
||||||
pub mod tracing_span_assert;
|
pub mod tracing_span_assert;
|
||||||
|
|
||||||
pub mod leaky_bucket;
|
|
||||||
pub mod rate_limit;
|
pub mod rate_limit;
|
||||||
|
|
||||||
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
||||||
|
|||||||
@@ -5,15 +5,6 @@ use std::time::{Duration, Instant};
|
|||||||
pub struct RateLimit {
|
pub struct RateLimit {
|
||||||
last: Option<Instant>,
|
last: Option<Instant>,
|
||||||
interval: Duration,
|
interval: Duration,
|
||||||
dropped: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct RateLimitStats(u64);
|
|
||||||
|
|
||||||
impl std::fmt::Display for RateLimitStats {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
|
||||||
write!(f, "{} dropped calls", self.0)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RateLimit {
|
impl RateLimit {
|
||||||
@@ -21,27 +12,20 @@ impl RateLimit {
|
|||||||
Self {
|
Self {
|
||||||
last: None,
|
last: None,
|
||||||
interval,
|
interval,
|
||||||
dropped: 0,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Call `f` if the rate limit allows.
|
/// Call `f` if the rate limit allows.
|
||||||
/// Don't call it otherwise.
|
/// Don't call it otherwise.
|
||||||
pub fn call<F: FnOnce()>(&mut self, f: F) {
|
pub fn call<F: FnOnce()>(&mut self, f: F) {
|
||||||
self.call2(|_| f())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
|
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
match self.last {
|
match self.last {
|
||||||
Some(last) if now - last <= self.interval => {
|
Some(last) if now - last <= self.interval => {
|
||||||
// ratelimit
|
// ratelimit
|
||||||
self.dropped += 1;
|
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
self.last = Some(now);
|
self.last = Some(now);
|
||||||
f(RateLimitStats(self.dropped));
|
f();
|
||||||
self.dropped = 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ anyhow.workspace = true
|
|||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
postgres_ffi.workspace = true
|
postgres_ffi.workspace = true
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
anyhow.workspace = true
|
anyhow.workspace = true
|
||||||
bindgen.workspace = true
|
bindgen.workspace = true
|
||||||
|
|||||||
@@ -95,7 +95,6 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.allowlist_var("ERROR")
|
.allowlist_var("ERROR")
|
||||||
.allowlist_var("FATAL")
|
.allowlist_var("FATAL")
|
||||||
.allowlist_var("PANIC")
|
.allowlist_var("PANIC")
|
||||||
.allowlist_var("PG_VERSION_NUM")
|
|
||||||
.allowlist_var("WPEVENT")
|
.allowlist_var("WPEVENT")
|
||||||
.allowlist_var("WL_LATCH_SET")
|
.allowlist_var("WL_LATCH_SET")
|
||||||
.allowlist_var("WL_SOCKET_READABLE")
|
.allowlist_var("WL_SOCKET_READABLE")
|
||||||
|
|||||||
@@ -282,11 +282,7 @@ mod tests {
|
|||||||
use std::cell::UnsafeCell;
|
use std::cell::UnsafeCell;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
|
|
||||||
use crate::{
|
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||||
api_bindings::Level,
|
|
||||||
bindings::{NeonWALReadResult, PG_VERSION_NUM},
|
|
||||||
walproposer::Wrapper,
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::ApiImpl;
|
use super::ApiImpl;
|
||||||
|
|
||||||
@@ -493,79 +489,41 @@ mod tests {
|
|||||||
|
|
||||||
let (sender, receiver) = sync_channel(1);
|
let (sender, receiver) = sync_channel(1);
|
||||||
|
|
||||||
// Messages definitions are at walproposer.h
|
|
||||||
// xxx: it would be better to extract them from safekeeper crate and
|
|
||||||
// use serialization/deserialization here.
|
|
||||||
let greeting_tag = (b'g' as u64).to_ne_bytes();
|
|
||||||
let proto_version = 2_u32.to_ne_bytes();
|
|
||||||
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
|
|
||||||
let proposer_id = [0; 16];
|
|
||||||
let system_id = 0_u64.to_ne_bytes();
|
|
||||||
let tenant_id = ttid.tenant_id.as_arr();
|
|
||||||
let timeline_id = ttid.timeline_id.as_arr();
|
|
||||||
let pg_tli = 1_u32.to_ne_bytes();
|
|
||||||
let wal_seg_size = 16777216_u32.to_ne_bytes();
|
|
||||||
let proposer_greeting = [
|
|
||||||
greeting_tag.as_slice(),
|
|
||||||
proto_version.as_slice(),
|
|
||||||
pg_version.as_slice(),
|
|
||||||
proposer_id.as_slice(),
|
|
||||||
system_id.as_slice(),
|
|
||||||
tenant_id.as_slice(),
|
|
||||||
timeline_id.as_slice(),
|
|
||||||
pg_tli.as_slice(),
|
|
||||||
wal_seg_size.as_slice(),
|
|
||||||
]
|
|
||||||
.concat();
|
|
||||||
|
|
||||||
let voting_tag = (b'v' as u64).to_ne_bytes();
|
|
||||||
let vote_request_term = 3_u64.to_ne_bytes();
|
|
||||||
let proposer_id = [0; 16];
|
|
||||||
let vote_request = [
|
|
||||||
voting_tag.as_slice(),
|
|
||||||
vote_request_term.as_slice(),
|
|
||||||
proposer_id.as_slice(),
|
|
||||||
]
|
|
||||||
.concat();
|
|
||||||
|
|
||||||
let acceptor_greeting_term = 2_u64.to_ne_bytes();
|
|
||||||
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
|
|
||||||
let acceptor_greeting = [
|
|
||||||
greeting_tag.as_slice(),
|
|
||||||
acceptor_greeting_term.as_slice(),
|
|
||||||
acceptor_greeting_node_id.as_slice(),
|
|
||||||
]
|
|
||||||
.concat();
|
|
||||||
|
|
||||||
let vote_response_term = 3_u64.to_ne_bytes();
|
|
||||||
let vote_given = 1_u64.to_ne_bytes();
|
|
||||||
let flush_lsn = 0x539_u64.to_ne_bytes();
|
|
||||||
let truncate_lsn = 0x539_u64.to_ne_bytes();
|
|
||||||
let th_len = 1_u32.to_ne_bytes();
|
|
||||||
let th_term = 2_u64.to_ne_bytes();
|
|
||||||
let th_lsn = 0x539_u64.to_ne_bytes();
|
|
||||||
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
|
|
||||||
let vote_response = [
|
|
||||||
voting_tag.as_slice(),
|
|
||||||
vote_response_term.as_slice(),
|
|
||||||
vote_given.as_slice(),
|
|
||||||
flush_lsn.as_slice(),
|
|
||||||
truncate_lsn.as_slice(),
|
|
||||||
th_len.as_slice(),
|
|
||||||
th_term.as_slice(),
|
|
||||||
th_lsn.as_slice(),
|
|
||||||
timeline_start_lsn.as_slice(),
|
|
||||||
]
|
|
||||||
.concat();
|
|
||||||
|
|
||||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||||
wait_events: Cell::new(WaitEventsData {
|
wait_events: Cell::new(WaitEventsData {
|
||||||
sk: std::ptr::null_mut(),
|
sk: std::ptr::null_mut(),
|
||||||
event_mask: 0,
|
event_mask: 0,
|
||||||
}),
|
}),
|
||||||
expected_messages: vec![proposer_greeting, vote_request],
|
expected_messages: vec![
|
||||||
|
// TODO: When updating Postgres versions, this test will cause
|
||||||
|
// problems. Postgres version in message needs updating.
|
||||||
|
//
|
||||||
|
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||||
|
vec![
|
||||||
|
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||||
|
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||||
|
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||||
|
],
|
||||||
|
// VoteRequest(VoteRequest { term: 3 })
|
||||||
|
vec![
|
||||||
|
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, 0, 0, 0,
|
||||||
|
],
|
||||||
|
],
|
||||||
expected_ptr: AtomicUsize::new(0),
|
expected_ptr: AtomicUsize::new(0),
|
||||||
safekeeper_replies: vec![acceptor_greeting, vote_response],
|
safekeeper_replies: vec![
|
||||||
|
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
||||||
|
vec![
|
||||||
|
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
],
|
||||||
|
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
||||||
|
vec![
|
||||||
|
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
||||||
|
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
||||||
|
],
|
||||||
|
],
|
||||||
replies_ptr: AtomicUsize::new(0),
|
replies_ptr: AtomicUsize::new(0),
|
||||||
sync_channel: sender,
|
sync_channel: sender,
|
||||||
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ arc-swap.workspace = true
|
|||||||
async-compression.workspace = true
|
async-compression.workspace = true
|
||||||
async-stream.workspace = true
|
async-stream.workspace = true
|
||||||
async-trait.workspace = true
|
async-trait.workspace = true
|
||||||
bit_field.workspace = true
|
|
||||||
byteorder.workspace = true
|
byteorder.workspace = true
|
||||||
bytes.workspace = true
|
bytes.workspace = true
|
||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
@@ -37,6 +36,7 @@ humantime.workspace = true
|
|||||||
humantime-serde.workspace = true
|
humantime-serde.workspace = true
|
||||||
hyper.workspace = true
|
hyper.workspace = true
|
||||||
itertools.workspace = true
|
itertools.workspace = true
|
||||||
|
leaky-bucket.workspace = true
|
||||||
md5.workspace = true
|
md5.workspace = true
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
# hack to get the number of worker threads tokio uses
|
# hack to get the number of worker threads tokio uses
|
||||||
@@ -52,7 +52,6 @@ rand.workspace = true
|
|||||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
scopeguard.workspace = true
|
scopeguard.workspace = true
|
||||||
send-future.workspace = true
|
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json = { workspace = true, features = ["raw_value"] }
|
serde_json = { workspace = true, features = ["raw_value"] }
|
||||||
serde_path_to_error.workspace = true
|
serde_path_to_error.workspace = true
|
||||||
|
|||||||
@@ -4,13 +4,12 @@ use bytes::Bytes;
|
|||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use criterion::{criterion_group, criterion_main, Criterion};
|
use criterion::{criterion_group, criterion_main, Criterion};
|
||||||
use pageserver::{
|
use pageserver::{
|
||||||
config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
|
config::PageServerConf,
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext},
|
||||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||||
page_cache,
|
page_cache,
|
||||||
repository::Value,
|
repository::Value,
|
||||||
task_mgr::TaskKind,
|
task_mgr::TaskKind,
|
||||||
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
|
||||||
tenant::storage_layer::InMemoryLayer,
|
tenant::storage_layer::InMemoryLayer,
|
||||||
virtual_file,
|
virtual_file,
|
||||||
};
|
};
|
||||||
@@ -68,16 +67,12 @@ async fn ingest(
|
|||||||
let layer =
|
let layer =
|
||||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||||
|
|
||||||
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||||
let data_ser_size = data.serialized_size().unwrap() as usize;
|
|
||||||
let ctx = RequestContext::new(
|
let ctx = RequestContext::new(
|
||||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||||
pageserver::context::DownloadBehavior::Download,
|
pageserver::context::DownloadBehavior::Download,
|
||||||
);
|
);
|
||||||
|
|
||||||
const BATCH_SIZE: usize = 16;
|
|
||||||
let mut batch = Vec::new();
|
|
||||||
|
|
||||||
for i in 0..put_count {
|
for i in 0..put_count {
|
||||||
lsn += put_size as u64;
|
lsn += put_size as u64;
|
||||||
|
|
||||||
@@ -100,17 +95,7 @@ async fn ingest(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
layer.put_value(key, lsn, &data, &ctx).await?;
|
||||||
if batch.len() >= BATCH_SIZE {
|
|
||||||
let this_batch = std::mem::take(&mut batch);
|
|
||||||
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
|
||||||
layer.put_batch(serialized, &ctx).await?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !batch.is_empty() {
|
|
||||||
let this_batch = std::mem::take(&mut batch);
|
|
||||||
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
|
||||||
layer.put_batch(serialized, &ctx).await?;
|
|
||||||
}
|
}
|
||||||
layer.freeze(lsn + 1).await;
|
layer.freeze(lsn + 1).await;
|
||||||
|
|
||||||
@@ -164,11 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) {
|
|||||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||||
));
|
));
|
||||||
virtual_file::init(
|
virtual_file::init(16384, virtual_file::io_engine_for_bench());
|
||||||
16384,
|
|
||||||
virtual_file::io_engine_for_bench(),
|
|
||||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
);
|
|
||||||
page_cache::init(conf.page_cache_size);
|
page_cache::init(conf.page_cache_size);
|
||||||
|
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ license.workspace = true
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
pageserver_api.workspace = true
|
pageserver_api.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
|
async-trait.workspace = true
|
||||||
reqwest = { workspace = true, features = [ "stream" ] }
|
reqwest = { workspace = true, features = [ "stream" ] }
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
|
|||||||
@@ -419,24 +419,6 @@ impl Client {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn timeline_archival_config(
|
|
||||||
&self,
|
|
||||||
tenant_shard_id: TenantShardId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
req: &TimelineArchivalConfigRequest,
|
|
||||||
) -> Result<()> {
|
|
||||||
let uri = format!(
|
|
||||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
|
|
||||||
self.mgmt_api_endpoint
|
|
||||||
);
|
|
||||||
|
|
||||||
self.request(Method::POST, &uri, req)
|
|
||||||
.await?
|
|
||||||
.json()
|
|
||||||
.await
|
|
||||||
.map_err(Error::ReceiveBody)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn timeline_detach_ancestor(
|
pub async fn timeline_detach_ancestor(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
@@ -524,16 +506,6 @@ impl Client {
|
|||||||
.map_err(Error::ReceiveBody)
|
.map_err(Error::ReceiveBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Configs io buffer alignment at runtime.
|
|
||||||
pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
|
|
||||||
let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
|
|
||||||
self.request(Method::PUT, uri, align)
|
|
||||||
.await?
|
|
||||||
.json()
|
|
||||||
.await
|
|
||||||
.map_err(Error::ReceiveBody)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||||
self.get(uri)
|
self.get(uri)
|
||||||
|
|||||||
@@ -4,7 +4,6 @@
|
|||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
|
||||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||||
use pageserver::task_mgr::TaskKind;
|
use pageserver::task_mgr::TaskKind;
|
||||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||||
@@ -145,11 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
|||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||||
|
|
||||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||||
pageserver::virtual_file::init(
|
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||||
10,
|
|
||||||
virtual_file::api::IoEngineKind::StdFs,
|
|
||||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
);
|
|
||||||
pageserver::page_cache::init(100);
|
pageserver::page_cache::init(100);
|
||||||
|
|
||||||
let mut total_delta_layers = 0usize;
|
let mut total_delta_layers = 0usize;
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use clap::Subcommand;
|
use clap::Subcommand;
|
||||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
|
||||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||||
use pageserver::task_mgr::TaskKind;
|
use pageserver::task_mgr::TaskKind;
|
||||||
use pageserver::tenant::block_io::BlockCursor;
|
use pageserver::tenant::block_io::BlockCursor;
|
||||||
@@ -60,7 +59,7 @@ pub(crate) enum LayerCmd {
|
|||||||
|
|
||||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
|
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||||
page_cache::init(100);
|
page_cache::init(100);
|
||||||
let file = VirtualFile::open(path, ctx).await?;
|
let file = VirtualFile::open(path, ctx).await?;
|
||||||
let file_id = page_cache::next_file_id();
|
let file_id = page_cache::next_file_id();
|
||||||
@@ -90,7 +89,6 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
|
|||||||
for (k, v) in all {
|
for (k, v) in all {
|
||||||
let value = cursor.read_blob(v.pos(), ctx).await?;
|
let value = cursor.read_blob(v.pos(), ctx).await?;
|
||||||
println!("key:{} value_len:{}", k, value.len());
|
println!("key:{} value_len:{}", k, value.len());
|
||||||
assert!(k.is_i128_representable(), "invalid key: ");
|
|
||||||
}
|
}
|
||||||
// TODO(chi): special handling for last key?
|
// TODO(chi): special handling for last key?
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -191,11 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
|||||||
new_tenant_id,
|
new_tenant_id,
|
||||||
new_timeline_id,
|
new_timeline_id,
|
||||||
} => {
|
} => {
|
||||||
pageserver::virtual_file::init(
|
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||||
10,
|
|
||||||
virtual_file::api::IoEngineKind::StdFs,
|
|
||||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
);
|
|
||||||
pageserver::page_cache::init(100);
|
pageserver::page_cache::init(100);
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ use clap::{Parser, Subcommand};
|
|||||||
use index_part::IndexPartCmd;
|
use index_part::IndexPartCmd;
|
||||||
use layers::LayerCmd;
|
use layers::LayerCmd;
|
||||||
use pageserver::{
|
use pageserver::{
|
||||||
config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
context::{DownloadBehavior, RequestContext},
|
context::{DownloadBehavior, RequestContext},
|
||||||
page_cache,
|
page_cache,
|
||||||
task_mgr::TaskKind,
|
task_mgr::TaskKind,
|
||||||
@@ -206,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
|||||||
|
|
||||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||||
// Basic initialization of things that don't change after startup
|
// Basic initialization of things that don't change after startup
|
||||||
virtual_file::init(
|
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||||
10,
|
|
||||||
virtual_file::api::IoEngineKind::StdFs,
|
|
||||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
);
|
|
||||||
page_cache::init(100);
|
page_cache::init(100);
|
||||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||||
dump_layerfile_from_path(path, true, &ctx).await
|
dump_layerfile_from_path(path, true, &ctx).await
|
||||||
|
|||||||
@@ -58,11 +58,6 @@ pub(crate) struct Args {
|
|||||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||||
|
|
||||||
/// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
|
|
||||||
#[clap(long)]
|
|
||||||
set_io_alignment: Option<usize>,
|
|
||||||
|
|
||||||
targets: Option<Vec<TenantTimelineId>>,
|
targets: Option<Vec<TenantTimelineId>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -129,10 +124,6 @@ async fn main_impl(
|
|||||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(align) = args.set_io_alignment {
|
|
||||||
mgmt_api_client.put_io_alignment(align).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// discover targets
|
// discover targets
|
||||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||||
&mgmt_api_client,
|
&mgmt_api_client,
|
||||||
|
|||||||
@@ -1,39 +0,0 @@
|
|||||||
//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
|
|
||||||
|
|
||||||
pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
|
|
||||||
if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
|
|
||||||
panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
pub(crate) trait U64IsUsize {
|
|
||||||
fn into_usize(self) -> usize;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl U64IsUsize for u64 {
|
|
||||||
#[inline(always)]
|
|
||||||
fn into_usize(self) -> usize {
|
|
||||||
#[allow(clippy::let_unit_value)]
|
|
||||||
let _ = _ASSERT_U64_EQ_USIZE;
|
|
||||||
self as usize
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) trait UsizeIsU64 {
|
|
||||||
fn into_u64(self) -> u64;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl UsizeIsU64 for usize {
|
|
||||||
#[inline(always)]
|
|
||||||
fn into_u64(self) -> u64 {
|
|
||||||
#[allow(clippy::let_unit_value)]
|
|
||||||
let _ = _ASSERT_U64_EQ_USIZE;
|
|
||||||
self as u64
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub const fn u64_to_usize(x: u64) -> usize {
|
|
||||||
#[allow(clippy::let_unit_value)]
|
|
||||||
let _ = _ASSERT_U64_EQ_USIZE;
|
|
||||||
x as usize
|
|
||||||
}
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
use anyhow;
|
|
||||||
use camino::Utf8PathBuf;
|
|
||||||
use clap::Parser;
|
|
||||||
use pageserver::{pg_import, virtual_file::{self, api::IoEngineKind}};
|
|
||||||
use utils::id::{TenantId, TimelineId};
|
|
||||||
use utils::logging::{self, LogFormat, TracingErrorLayerEnablement};
|
|
||||||
|
|
||||||
use std::str::FromStr;
|
|
||||||
|
|
||||||
//project_git_version!(GIT_VERSION);
|
|
||||||
|
|
||||||
#[derive(Parser)]
|
|
||||||
#[command(
|
|
||||||
//version = GIT_VERSION,
|
|
||||||
about = "Utility to import a Postgres data directory directly into image layers",
|
|
||||||
//long_about = "..."
|
|
||||||
)]
|
|
||||||
struct CliOpts {
|
|
||||||
/// Input Postgres data directory
|
|
||||||
pgdata: Utf8PathBuf,
|
|
||||||
|
|
||||||
/// Path to local dir where the layer files will be stored
|
|
||||||
dest_path: Utf8PathBuf,
|
|
||||||
|
|
||||||
#[arg(long, default_value_t = TenantId::from_str("42424242424242424242424242424242").unwrap())]
|
|
||||||
tenant_id: TenantId,
|
|
||||||
#[arg(long, default_value_t = TimelineId::from_str("42424242424242424242424242424242").unwrap())]
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
|
||||||
logging::init(
|
|
||||||
LogFormat::Plain,
|
|
||||||
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
|
||||||
logging::Output::Stdout,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
virtual_file::init(
|
|
||||||
100,
|
|
||||||
IoEngineKind::StdFs,
|
|
||||||
512,
|
|
||||||
);
|
|
||||||
|
|
||||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
|
||||||
.enable_all()
|
|
||||||
.build()?;
|
|
||||||
|
|
||||||
let cli = CliOpts::parse();
|
|
||||||
|
|
||||||
rt.block_on(async_main(cli))?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn async_main(cli: CliOpts) -> anyhow::Result<()> {
|
|
||||||
let mut import = pg_import::PgImportEnv::init(&cli.dest_path, cli.tenant_id, cli.timeline_id).await?;
|
|
||||||
|
|
||||||
import.import_datadir(&cli.pgdata).await?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
@@ -124,70 +124,21 @@ fn main() -> anyhow::Result<()> {
|
|||||||
// after setting up logging, log the effective IO engine choice and read path implementations
|
// after setting up logging, log the effective IO engine choice and read path implementations
|
||||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||||
|
info!(?conf.get_impl, "starting with get page implementation");
|
||||||
|
info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
|
||||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||||
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
|
|
||||||
|
|
||||||
// The tenants directory contains all the pageserver local disk state.
|
|
||||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
|
||||||
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
|
|
||||||
// After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
|
|
||||||
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
|
|
||||||
let tenants_path = conf.tenants_path();
|
let tenants_path = conf.tenants_path();
|
||||||
{
|
if !tenants_path.exists() {
|
||||||
let open = || {
|
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||||
nix::dir::Dir::open(
|
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
|
||||||
tenants_path.as_std_path(),
|
|
||||||
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
|
|
||||||
nix::sys::stat::Mode::empty(),
|
|
||||||
)
|
|
||||||
};
|
|
||||||
let dirfd = match open() {
|
|
||||||
Ok(dirfd) => dirfd,
|
|
||||||
Err(e) => match e {
|
|
||||||
nix::errno::Errno::ENOENT => {
|
|
||||||
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
|
|
||||||
format!("Failed to create tenants root dir at '{tenants_path}'")
|
|
||||||
})?;
|
|
||||||
open().context("open tenants dir after creating it")?
|
|
||||||
}
|
|
||||||
e => anyhow::bail!(e),
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
let started = Instant::now();
|
|
||||||
// Linux guarantees durability for syncfs.
|
|
||||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
|
||||||
#[cfg(target_os = "linux")]
|
|
||||||
{
|
|
||||||
use std::os::fd::AsRawFd;
|
|
||||||
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
|
|
||||||
}
|
|
||||||
#[cfg(target_os = "macos")]
|
|
||||||
{
|
|
||||||
// macOS is not a production platform for Neon, don't even bother.
|
|
||||||
drop(dirfd);
|
|
||||||
}
|
|
||||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
|
||||||
{
|
|
||||||
compile_error!("Unsupported OS");
|
|
||||||
}
|
|
||||||
|
|
||||||
let elapsed = started.elapsed();
|
|
||||||
info!(
|
|
||||||
elapsed_ms = elapsed.as_millis(),
|
|
||||||
"made tenant directory contents durable"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initialize up failpoints support
|
// Initialize up failpoints support
|
||||||
let scenario = failpoint_support::init();
|
let scenario = failpoint_support::init();
|
||||||
|
|
||||||
// Basic initialization of things that don't change after startup
|
// Basic initialization of things that don't change after startup
|
||||||
virtual_file::init(
|
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
|
||||||
conf.max_file_descriptors,
|
|
||||||
conf.virtual_file_io_engine,
|
|
||||||
conf.io_buffer_alignment,
|
|
||||||
);
|
|
||||||
page_cache::init(conf.page_cache_size);
|
page_cache::init(conf.page_cache_size);
|
||||||
|
|
||||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||||
|
|||||||
@@ -29,13 +29,12 @@ use utils::{
|
|||||||
logging::LogFormat,
|
logging::LogFormat,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::l0_flush::L0FlushConfig;
|
|
||||||
use crate::tenant::config::TenantConfOpt;
|
|
||||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
|
||||||
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
||||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||||
|
use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
|
||||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||||
|
use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl};
|
||||||
use crate::{tenant::config::TenantConf, virtual_file};
|
use crate::{tenant::config::TenantConf, virtual_file};
|
||||||
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
|
use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX};
|
||||||
|
|
||||||
@@ -51,6 +50,7 @@ pub mod defaults {
|
|||||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||||
DEFAULT_PG_LISTEN_PORT,
|
DEFAULT_PG_LISTEN_PORT,
|
||||||
};
|
};
|
||||||
|
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||||
|
|
||||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||||
@@ -90,14 +90,13 @@ pub mod defaults {
|
|||||||
|
|
||||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||||
|
|
||||||
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
|
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||||
|
ImageCompressionAlgorithm::Disabled;
|
||||||
|
|
||||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||||
|
|
||||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||||
|
|
||||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Default built-in configuration file.
|
/// Default built-in configuration file.
|
||||||
///
|
///
|
||||||
@@ -134,8 +133,14 @@ pub mod defaults {
|
|||||||
|
|
||||||
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
|
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
|
||||||
|
|
||||||
|
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
|
||||||
|
|
||||||
|
#get_impl = '{DEFAULT_GET_IMPL}'
|
||||||
|
|
||||||
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
|
#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
|
||||||
|
|
||||||
|
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||||
|
|
||||||
[tenant_config]
|
[tenant_config]
|
||||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||||
@@ -273,8 +278,14 @@ pub struct PageServerConf {
|
|||||||
|
|
||||||
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
||||||
|
|
||||||
|
pub get_vectored_impl: GetVectoredImpl,
|
||||||
|
|
||||||
|
pub get_impl: GetImpl,
|
||||||
|
|
||||||
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
pub max_vectored_read_bytes: MaxVectoredReadBytes,
|
||||||
|
|
||||||
|
pub validate_vectored_get: bool,
|
||||||
|
|
||||||
pub image_compression: ImageCompressionAlgorithm,
|
pub image_compression: ImageCompressionAlgorithm,
|
||||||
|
|
||||||
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
/// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this
|
||||||
@@ -292,8 +303,6 @@ pub struct PageServerConf {
|
|||||||
|
|
||||||
/// Direct IO settings
|
/// Direct IO settings
|
||||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||||
|
|
||||||
pub io_buffer_alignment: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||||
@@ -387,8 +396,14 @@ struct PageServerConfigBuilder {
|
|||||||
|
|
||||||
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
||||||
|
|
||||||
|
get_vectored_impl: BuilderValue<GetVectoredImpl>,
|
||||||
|
|
||||||
|
get_impl: BuilderValue<GetImpl>,
|
||||||
|
|
||||||
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
|
||||||
|
|
||||||
|
validate_vectored_get: BuilderValue<bool>,
|
||||||
|
|
||||||
image_compression: BuilderValue<ImageCompressionAlgorithm>,
|
image_compression: BuilderValue<ImageCompressionAlgorithm>,
|
||||||
|
|
||||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||||
@@ -398,8 +413,6 @@ struct PageServerConfigBuilder {
|
|||||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||||
|
|
||||||
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
|
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
|
||||||
|
|
||||||
io_buffer_alignment: BuilderValue<usize>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PageServerConfigBuilder {
|
impl PageServerConfigBuilder {
|
||||||
@@ -480,15 +493,17 @@ impl PageServerConfigBuilder {
|
|||||||
|
|
||||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||||
|
|
||||||
|
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
|
||||||
|
get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
|
||||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||||
)),
|
)),
|
||||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
|
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
|
||||||
|
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||||
l0_flush: Set(L0FlushConfig::default()),
|
l0_flush: Set(L0FlushConfig::default()),
|
||||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||||
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
|
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
|
||||||
io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -644,10 +659,22 @@ impl PageServerConfigBuilder {
|
|||||||
self.virtual_file_io_engine = BuilderValue::Set(value);
|
self.virtual_file_io_engine = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
|
||||||
|
self.get_vectored_impl = BuilderValue::Set(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_impl(&mut self, value: GetImpl) {
|
||||||
|
self.get_impl = BuilderValue::Set(value);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
|
pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
|
||||||
self.max_vectored_read_bytes = BuilderValue::Set(value);
|
self.max_vectored_read_bytes = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_validate_vectored_get(&mut self, value: bool) {
|
||||||
|
self.validate_vectored_get = BuilderValue::Set(value);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
|
pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) {
|
||||||
self.image_compression = BuilderValue::Set(value);
|
self.image_compression = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
@@ -668,10 +695,6 @@ impl PageServerConfigBuilder {
|
|||||||
self.virtual_file_direct_io = BuilderValue::Set(value);
|
self.virtual_file_direct_io = BuilderValue::Set(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn io_buffer_alignment(&mut self, value: usize) {
|
|
||||||
self.io_buffer_alignment = BuilderValue::Set(value);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||||
let default = Self::default_values();
|
let default = Self::default_values();
|
||||||
|
|
||||||
@@ -722,13 +745,15 @@ impl PageServerConfigBuilder {
|
|||||||
heatmap_upload_concurrency,
|
heatmap_upload_concurrency,
|
||||||
secondary_download_concurrency,
|
secondary_download_concurrency,
|
||||||
ingest_batch_size,
|
ingest_batch_size,
|
||||||
|
get_vectored_impl,
|
||||||
|
get_impl,
|
||||||
max_vectored_read_bytes,
|
max_vectored_read_bytes,
|
||||||
|
validate_vectored_get,
|
||||||
image_compression,
|
image_compression,
|
||||||
ephemeral_bytes_per_memory_kb,
|
ephemeral_bytes_per_memory_kb,
|
||||||
l0_flush,
|
l0_flush,
|
||||||
compact_level0_phase1_value_access,
|
compact_level0_phase1_value_access,
|
||||||
virtual_file_direct_io,
|
virtual_file_direct_io,
|
||||||
io_buffer_alignment,
|
|
||||||
}
|
}
|
||||||
CUSTOM LOGIC
|
CUSTOM LOGIC
|
||||||
{
|
{
|
||||||
@@ -977,12 +1002,21 @@ impl PageServerConf {
|
|||||||
"virtual_file_io_engine" => {
|
"virtual_file_io_engine" => {
|
||||||
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
|
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
|
||||||
}
|
}
|
||||||
|
"get_vectored_impl" => {
|
||||||
|
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
|
||||||
|
}
|
||||||
|
"get_impl" => {
|
||||||
|
builder.get_impl(parse_toml_from_str("get_impl", item)?)
|
||||||
|
}
|
||||||
"max_vectored_read_bytes" => {
|
"max_vectored_read_bytes" => {
|
||||||
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
|
let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
|
||||||
builder.get_max_vectored_read_bytes(
|
builder.get_max_vectored_read_bytes(
|
||||||
MaxVectoredReadBytes(
|
MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
|
NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
|
||||||
}
|
}
|
||||||
|
"validate_vectored_get" => {
|
||||||
|
builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
|
||||||
|
}
|
||||||
"image_compression" => {
|
"image_compression" => {
|
||||||
builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
|
builder.get_image_compression(parse_toml_from_str("image_compression", item)?)
|
||||||
}
|
}
|
||||||
@@ -998,9 +1032,6 @@ impl PageServerConf {
|
|||||||
"virtual_file_direct_io" => {
|
"virtual_file_direct_io" => {
|
||||||
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
|
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
|
||||||
}
|
}
|
||||||
"io_buffer_alignment" => {
|
|
||||||
builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
|
|
||||||
}
|
|
||||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1021,15 +1052,6 @@ impl PageServerConf {
|
|||||||
|
|
||||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||||
|
|
||||||
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
|
|
||||||
.map_err(|msg| anyhow::anyhow!("{msg}"))
|
|
||||||
.with_context(|| {
|
|
||||||
format!(
|
|
||||||
"effective checkpoint distance is unsupported: {}",
|
|
||||||
conf.default_tenant_conf.checkpoint_distance
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(conf)
|
Ok(conf)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1084,16 +1106,18 @@ impl PageServerConf {
|
|||||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||||
|
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||||
|
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
|
||||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||||
.expect("Invalid default constant"),
|
.expect("Invalid default constant"),
|
||||||
),
|
),
|
||||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||||
|
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||||
l0_flush: L0FlushConfig::default(),
|
l0_flush: L0FlushConfig::default(),
|
||||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1325,16 +1349,18 @@ background_task_maximum_delay = '334 s'
|
|||||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||||
|
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||||
|
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
|
||||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||||
.expect("Invalid default constant")
|
.expect("Invalid default constant")
|
||||||
),
|
),
|
||||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||||
|
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||||
l0_flush: L0FlushConfig::default(),
|
l0_flush: L0FlushConfig::default(),
|
||||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
},
|
},
|
||||||
"Correct defaults should be used when no config values are provided"
|
"Correct defaults should be used when no config values are provided"
|
||||||
);
|
);
|
||||||
@@ -1399,16 +1425,18 @@ background_task_maximum_delay = '334 s'
|
|||||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||||
ingest_batch_size: 100,
|
ingest_batch_size: 100,
|
||||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||||
|
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||||
|
get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
|
||||||
max_vectored_read_bytes: MaxVectoredReadBytes(
|
max_vectored_read_bytes: MaxVectoredReadBytes(
|
||||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||||
.expect("Invalid default constant")
|
.expect("Invalid default constant")
|
||||||
),
|
),
|
||||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||||
|
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||||
l0_flush: L0FlushConfig::default(),
|
l0_flush: L0FlushConfig::default(),
|
||||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
|
||||||
},
|
},
|
||||||
"Should be able to parse all basic config values correctly"
|
"Should be able to parse all basic config values correctly"
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
//! Periodically collect consumption metrics for all active tenants
|
//! Periodically collect consumption metrics for all active tenants
|
||||||
//! and push them to a HTTP endpoint.
|
//! and push them to a HTTP endpoint.
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::consumption_metrics::metrics::MetricsKey;
|
|
||||||
use crate::consumption_metrics::upload::KeyGen as _;
|
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||||
@@ -10,7 +8,6 @@ use crate::tenant::tasks::BackgroundLoopKind;
|
|||||||
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use consumption_metrics::EventType;
|
use consumption_metrics::EventType;
|
||||||
use itertools::Itertools as _;
|
|
||||||
use pageserver_api::models::TenantState;
|
use pageserver_api::models::TenantState;
|
||||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
@@ -22,8 +19,9 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::id::NodeId;
|
use utils::id::NodeId;
|
||||||
|
|
||||||
mod disk_cache;
|
|
||||||
mod metrics;
|
mod metrics;
|
||||||
|
use crate::consumption_metrics::metrics::MetricsKey;
|
||||||
|
mod disk_cache;
|
||||||
mod upload;
|
mod upload;
|
||||||
|
|
||||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||||
@@ -145,12 +143,6 @@ async fn collect_metrics(
|
|||||||
// these are point in time, with variable "now"
|
// these are point in time, with variable "now"
|
||||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||||
|
|
||||||
// Pre-generate event idempotency keys, to reuse them across the bucket
|
|
||||||
// and HTTP sinks.
|
|
||||||
let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
|
|
||||||
.take(metrics.len())
|
|
||||||
.collect_vec();
|
|
||||||
|
|
||||||
let metrics = Arc::new(metrics);
|
let metrics = Arc::new(metrics);
|
||||||
|
|
||||||
// why not race cancellation here? because we are one of the last tasks, and if we are
|
// why not race cancellation here? because we are one of the last tasks, and if we are
|
||||||
@@ -169,14 +161,8 @@ async fn collect_metrics(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(bucket_client) = &bucket_client {
|
if let Some(bucket_client) = &bucket_client {
|
||||||
let res = upload::upload_metrics_bucket(
|
let res =
|
||||||
bucket_client,
|
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
||||||
&cancel,
|
|
||||||
&node_id,
|
|
||||||
&metrics,
|
|
||||||
&idempotency_keys,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
tracing::error!("failed to upload to S3: {e:#}");
|
tracing::error!("failed to upload to S3: {e:#}");
|
||||||
}
|
}
|
||||||
@@ -188,9 +174,9 @@ async fn collect_metrics(
|
|||||||
&client,
|
&client,
|
||||||
metric_collection_endpoint,
|
metric_collection_endpoint,
|
||||||
&cancel,
|
&cancel,
|
||||||
|
&node_id,
|
||||||
&metrics,
|
&metrics,
|
||||||
&mut cached_metrics,
|
&mut cached_metrics,
|
||||||
&idempotency_keys,
|
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
|
|||||||
@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
|
|||||||
client: &reqwest::Client,
|
client: &reqwest::Client,
|
||||||
metric_collection_endpoint: &reqwest::Url,
|
metric_collection_endpoint: &reqwest::Url,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
|
node_id: &str,
|
||||||
metrics: &[RawMetric],
|
metrics: &[RawMetric],
|
||||||
cached_metrics: &mut Cache,
|
cached_metrics: &mut Cache,
|
||||||
idempotency_keys: &[IdempotencyKey<'_>],
|
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut uploaded = 0;
|
let mut uploaded = 0;
|
||||||
let mut failed = 0;
|
let mut failed = 0;
|
||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
|
|
||||||
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
|
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
|
||||||
|
|
||||||
while let Some(res) = iter.next() {
|
while let Some(res) = iter.next() {
|
||||||
let (chunk, body) = res?;
|
let (chunk, body) = res?;
|
||||||
@@ -87,7 +87,6 @@ pub(super) async fn upload_metrics_bucket(
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
node_id: &str,
|
node_id: &str,
|
||||||
metrics: &[RawMetric],
|
metrics: &[RawMetric],
|
||||||
idempotency_keys: &[IdempotencyKey<'_>],
|
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
if metrics.is_empty() {
|
if metrics.is_empty() {
|
||||||
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
||||||
@@ -107,7 +106,7 @@ pub(super) async fn upload_metrics_bucket(
|
|||||||
|
|
||||||
// Serialize and write into compressed buffer
|
// Serialize and write into compressed buffer
|
||||||
let started_at = std::time::Instant::now();
|
let started_at = std::time::Instant::now();
|
||||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
|
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
|
||||||
let (_chunk, body) = res?;
|
let (_chunk, body) = res?;
|
||||||
gzip_writer.write_all(&body).await?;
|
gzip_writer.write_all(&body).await?;
|
||||||
}
|
}
|
||||||
@@ -135,31 +134,29 @@ pub(super) async fn upload_metrics_bucket(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
|
// The return type is quite ugly, but we gain testability in isolation
|
||||||
/// idempotency keys are injected into the corresponding metric events (reused
|
fn serialize_in_chunks<'a, F>(
|
||||||
/// across different metrics sinks), and must have the same length as input.
|
|
||||||
fn serialize_in_chunks<'a>(
|
|
||||||
chunk_size: usize,
|
chunk_size: usize,
|
||||||
input: &'a [RawMetric],
|
input: &'a [RawMetric],
|
||||||
idempotency_keys: &'a [IdempotencyKey<'a>],
|
factory: F,
|
||||||
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||||
|
where
|
||||||
|
F: KeyGen<'a> + 'a,
|
||||||
{
|
{
|
||||||
use bytes::BufMut;
|
use bytes::BufMut;
|
||||||
|
|
||||||
assert_eq!(input.len(), idempotency_keys.len());
|
struct Iter<'a, F> {
|
||||||
|
|
||||||
struct Iter<'a> {
|
|
||||||
inner: std::slice::Chunks<'a, RawMetric>,
|
inner: std::slice::Chunks<'a, RawMetric>,
|
||||||
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
|
|
||||||
chunk_size: usize,
|
chunk_size: usize,
|
||||||
|
|
||||||
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
|
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
|
||||||
buffer: bytes::BytesMut,
|
buffer: bytes::BytesMut,
|
||||||
// chunk amount of events are reused to produce the serialized document
|
// chunk amount of events are reused to produce the serialized document
|
||||||
scratch: Vec<Event<Ids, Name>>,
|
scratch: Vec<Event<Ids, Name>>,
|
||||||
|
factory: F,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator for Iter<'a> {
|
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
|
||||||
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
@@ -170,14 +167,17 @@ fn serialize_in_chunks<'a>(
|
|||||||
self.scratch.extend(
|
self.scratch.extend(
|
||||||
chunk
|
chunk
|
||||||
.iter()
|
.iter()
|
||||||
.zip(&mut self.idempotency_keys)
|
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
|
||||||
.map(|(raw_metric, key)| raw_metric.as_event(key)),
|
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
// next rounds: update_in_place to reuse allocations
|
// next rounds: update_in_place to reuse allocations
|
||||||
assert_eq!(self.scratch.len(), self.chunk_size);
|
assert_eq!(self.scratch.len(), self.chunk_size);
|
||||||
itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
|
self.scratch
|
||||||
.for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
|
.iter_mut()
|
||||||
|
.zip(chunk.iter())
|
||||||
|
.for_each(|(slot, raw_metric)| {
|
||||||
|
raw_metric.update_in_place(slot, &self.factory.generate())
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let res = serde_json::to_writer(
|
let res = serde_json::to_writer(
|
||||||
@@ -198,19 +198,18 @@ fn serialize_in_chunks<'a>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ExactSizeIterator for Iter<'a> {}
|
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
|
||||||
|
|
||||||
let buffer = bytes::BytesMut::new();
|
let buffer = bytes::BytesMut::new();
|
||||||
let inner = input.chunks(chunk_size);
|
let inner = input.chunks(chunk_size);
|
||||||
let idempotency_keys = idempotency_keys.iter();
|
|
||||||
let scratch = Vec::new();
|
let scratch = Vec::new();
|
||||||
|
|
||||||
Iter {
|
Iter {
|
||||||
inner,
|
inner,
|
||||||
idempotency_keys,
|
|
||||||
chunk_size,
|
chunk_size,
|
||||||
buffer,
|
buffer,
|
||||||
scratch,
|
scratch,
|
||||||
|
factory,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -269,7 +268,7 @@ impl RawMetricExt for RawMetric {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) trait KeyGen<'a> {
|
trait KeyGen<'a>: Copy {
|
||||||
fn generate(&self) -> IdempotencyKey<'a>;
|
fn generate(&self) -> IdempotencyKey<'a>;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -390,10 +389,7 @@ mod tests {
|
|||||||
let examples = metric_samples();
|
let examples = metric_samples();
|
||||||
assert!(examples.len() > 1);
|
assert!(examples.len() > 1);
|
||||||
|
|
||||||
let now = Utc::now();
|
let factory = FixedGen::new(Utc::now(), "1", 42);
|
||||||
let idempotency_keys = (0..examples.len())
|
|
||||||
.map(|i| FixedGen::new(now, "1", i as u16).generate())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
// need to use Event here because serde_json::Value uses default hashmap, not linked
|
// need to use Event here because serde_json::Value uses default hashmap, not linked
|
||||||
// hashmap
|
// hashmap
|
||||||
@@ -402,13 +398,13 @@ mod tests {
|
|||||||
events: Vec<Event<Ids, Name>>,
|
events: Vec<Event<Ids, Name>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
|
let correct = serialize_in_chunks(examples.len(), &examples, factory)
|
||||||
.map(|res| res.unwrap().1)
|
.map(|res| res.unwrap().1)
|
||||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
for chunk_size in 1..examples.len() {
|
for chunk_size in 1..examples.len() {
|
||||||
let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
|
let actual = serialize_in_chunks(chunk_size, &examples, factory)
|
||||||
.map(|res| res.unwrap().1)
|
.map(|res| res.unwrap().1)
|
||||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|||||||
@@ -105,10 +105,8 @@ pub struct RequestContext {
|
|||||||
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||||
pub enum PageContentKind {
|
pub enum PageContentKind {
|
||||||
Unknown,
|
Unknown,
|
||||||
DeltaLayerSummary,
|
|
||||||
DeltaLayerBtreeNode,
|
DeltaLayerBtreeNode,
|
||||||
DeltaLayerValue,
|
DeltaLayerValue,
|
||||||
ImageLayerSummary,
|
|
||||||
ImageLayerBtreeNode,
|
ImageLayerBtreeNode,
|
||||||
ImageLayerValue,
|
ImageLayerValue,
|
||||||
InMemoryLayer,
|
InMemoryLayer,
|
||||||
|
|||||||
@@ -141,18 +141,12 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
|||||||
m.other
|
m.other
|
||||||
);
|
);
|
||||||
|
|
||||||
let az_id = m
|
|
||||||
.other
|
|
||||||
.get("availability_zone_id")
|
|
||||||
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
|
||||||
|
|
||||||
Some(NodeRegisterRequest {
|
Some(NodeRegisterRequest {
|
||||||
node_id: conf.id,
|
node_id: conf.id,
|
||||||
listen_pg_addr: m.postgres_host,
|
listen_pg_addr: m.postgres_host,
|
||||||
listen_pg_port: m.postgres_port,
|
listen_pg_port: m.postgres_port,
|
||||||
listen_http_addr: m.http_host,
|
listen_http_addr: m.http_host,
|
||||||
listen_http_port: m.http_port,
|
listen_http_port: m.http_port,
|
||||||
availability_zone_id: az_id,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ use crate::{
|
|||||||
mgr::TenantManager,
|
mgr::TenantManager,
|
||||||
remote_timeline_client::LayerFileMetadata,
|
remote_timeline_client::LayerFileMetadata,
|
||||||
secondary::SecondaryTenant,
|
secondary::SecondaryTenant,
|
||||||
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
|
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
|
||||||
},
|
},
|
||||||
CancellableTask, DiskUsageEvictionTask,
|
CancellableTask, DiskUsageEvictionTask,
|
||||||
};
|
};
|
||||||
@@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl EvictionOrder {
|
impl EvictionOrder {
|
||||||
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
|
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
|
||||||
use EvictionOrder::*;
|
use EvictionOrder::*;
|
||||||
|
|
||||||
match self {
|
match self {
|
||||||
@@ -644,7 +644,6 @@ pub(crate) struct EvictionCandidate {
|
|||||||
pub(crate) layer: EvictionLayer,
|
pub(crate) layer: EvictionLayer,
|
||||||
pub(crate) last_activity_ts: SystemTime,
|
pub(crate) last_activity_ts: SystemTime,
|
||||||
pub(crate) relative_last_activity: finite_f32::FiniteF32,
|
pub(crate) relative_last_activity: finite_f32::FiniteF32,
|
||||||
pub(crate) visibility: LayerVisibilityHint,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for EvictionLayer {
|
impl std::fmt::Display for EvictionLayer {
|
||||||
@@ -686,22 +685,14 @@ impl std::fmt::Debug for EvictionCandidate {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
enum EvictionPartition {
|
enum MinResidentSizePartition {
|
||||||
// A layer that is un-wanted by the tenant: evict all these first, before considering
|
|
||||||
// any other layers
|
|
||||||
EvictNow,
|
|
||||||
|
|
||||||
// Above the minimum size threshold: this layer is a candidate for eviction.
|
|
||||||
Above,
|
Above,
|
||||||
|
|
||||||
// Below the minimum size threshold: this layer should only be evicted if all the
|
|
||||||
// tenants' layers above the minimum size threshold have already been considered.
|
|
||||||
Below,
|
Below,
|
||||||
}
|
}
|
||||||
|
|
||||||
enum EvictionCandidates {
|
enum EvictionCandidates {
|
||||||
Cancelled,
|
Cancelled,
|
||||||
Finished(Vec<(EvictionPartition, EvictionCandidate)>),
|
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gather the eviction candidates.
|
/// Gather the eviction candidates.
|
||||||
@@ -899,10 +890,8 @@ async fn collect_eviction_candidates(
|
|||||||
max_layer_size
|
max_layer_size
|
||||||
};
|
};
|
||||||
|
|
||||||
// Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
|
// Sort layers most-recently-used first, then partition by
|
||||||
// where the inputs are:
|
// cumsum above/below min_resident_size.
|
||||||
// - whether the layer is visible
|
|
||||||
// - whether the layer is above/below the min_resident_size cutline
|
|
||||||
tenant_candidates
|
tenant_candidates
|
||||||
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||||
let mut cumsum: i128 = 0;
|
let mut cumsum: i128 = 0;
|
||||||
@@ -919,23 +908,12 @@ async fn collect_eviction_candidates(
|
|||||||
candidate.relative_last_activity =
|
candidate.relative_last_activity =
|
||||||
eviction_order.relative_last_activity(total, i);
|
eviction_order.relative_last_activity(total, i);
|
||||||
|
|
||||||
let partition = match candidate.visibility {
|
let partition = if cumsum > min_resident_size as i128 {
|
||||||
LayerVisibilityHint::Covered => {
|
MinResidentSizePartition::Above
|
||||||
// Covered layers are evicted first
|
} else {
|
||||||
EvictionPartition::EvictNow
|
MinResidentSizePartition::Below
|
||||||
}
|
|
||||||
LayerVisibilityHint::Visible => {
|
|
||||||
cumsum += i128::from(candidate.layer.get_file_size());
|
|
||||||
|
|
||||||
if cumsum > min_resident_size as i128 {
|
|
||||||
EvictionPartition::Above
|
|
||||||
} else {
|
|
||||||
// The most recent layers below the min_resident_size threshold
|
|
||||||
// are the last to be evicted.
|
|
||||||
EvictionPartition::Below
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
cumsum += i128::from(candidate.layer.get_file_size());
|
||||||
|
|
||||||
(partition, candidate)
|
(partition, candidate)
|
||||||
});
|
});
|
||||||
@@ -1003,7 +981,7 @@ async fn collect_eviction_candidates(
|
|||||||
// Secondary locations' layers are always considered above the min resident size,
|
// Secondary locations' layers are always considered above the min resident size,
|
||||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||||
// the layers have sufficiently old access times.
|
// the layers have sufficiently old access times.
|
||||||
EvictionPartition::Above,
|
MinResidentSizePartition::Above,
|
||||||
candidate,
|
candidate,
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
@@ -1031,9 +1009,7 @@ async fn collect_eviction_candidates(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug_assert!(EvictionPartition::Above < EvictionPartition::Below,
|
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
|
||||||
debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above,
|
|
||||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||||
|
|
||||||
eviction_order.sort(&mut candidates);
|
eviction_order.sort(&mut candidates);
|
||||||
@@ -1046,7 +1022,7 @@ async fn collect_eviction_candidates(
|
|||||||
///
|
///
|
||||||
/// Returns the amount of candidates selected, with the planned usage.
|
/// Returns the amount of candidates selected, with the planned usage.
|
||||||
fn select_victims<U: Usage>(
|
fn select_victims<U: Usage>(
|
||||||
candidates: &[(EvictionPartition, EvictionCandidate)],
|
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
|
||||||
usage_pre: U,
|
usage_pre: U,
|
||||||
) -> VictimSelection<U> {
|
) -> VictimSelection<U> {
|
||||||
let mut usage_when_switched = None;
|
let mut usage_when_switched = None;
|
||||||
@@ -1058,7 +1034,7 @@ fn select_victims<U: Usage>(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
|
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
|
||||||
usage_when_switched = Some((usage_planned, i));
|
usage_when_switched = Some((usage_planned, i));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -178,8 +178,10 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
|||||||
impl From<PageReconstructError> for ApiError {
|
impl From<PageReconstructError> for ApiError {
|
||||||
fn from(pre: PageReconstructError) -> ApiError {
|
fn from(pre: PageReconstructError) -> ApiError {
|
||||||
match pre {
|
match pre {
|
||||||
PageReconstructError::Other(other) => ApiError::InternalServerError(other),
|
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
||||||
PageReconstructError::MissingKey(e) => ApiError::InternalServerError(e.into()),
|
PageReconstructError::MissingKey(e) => {
|
||||||
|
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
|
||||||
|
}
|
||||||
PageReconstructError::Cancelled => ApiError::Cancelled,
|
PageReconstructError::Cancelled => ApiError::Cancelled,
|
||||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||||
@@ -318,27 +320,6 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<crate::tenant::TimelineArchivalError> for ApiError {
|
|
||||||
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
|
|
||||||
use crate::tenant::TimelineArchivalError::*;
|
|
||||||
match value {
|
|
||||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
|
||||||
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
|
|
||||||
e @ HasArchivedParent(_) => {
|
|
||||||
ApiError::PreconditionFailed(e.to_string().into_boxed_str())
|
|
||||||
}
|
|
||||||
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
|
|
||||||
format!(
|
|
||||||
"Cannot archive timeline which has non-archived child timelines: {children:?}"
|
|
||||||
)
|
|
||||||
.into_boxed_str(),
|
|
||||||
),
|
|
||||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
|
||||||
Other(e) => ApiError::InternalServerError(e),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||||
@@ -426,8 +407,6 @@ async fn build_timeline_info_common(
|
|||||||
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
||||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||||
let state = timeline.current_state();
|
let state = timeline.current_state();
|
||||||
// Report is_archived = false if the timeline is still loading
|
|
||||||
let is_archived = timeline.is_archived().unwrap_or(false);
|
|
||||||
let remote_consistent_lsn_projected = timeline
|
let remote_consistent_lsn_projected = timeline
|
||||||
.get_remote_consistent_lsn_projected()
|
.get_remote_consistent_lsn_projected()
|
||||||
.unwrap_or(Lsn(0));
|
.unwrap_or(Lsn(0));
|
||||||
@@ -468,7 +447,6 @@ async fn build_timeline_info_common(
|
|||||||
pg_version: timeline.pg_version,
|
pg_version: timeline.pg_version,
|
||||||
|
|
||||||
state,
|
state,
|
||||||
is_archived,
|
|
||||||
|
|
||||||
walreceiver_status,
|
walreceiver_status,
|
||||||
|
|
||||||
@@ -710,7 +688,9 @@ async fn timeline_archival_config_handler(
|
|||||||
|
|
||||||
tenant
|
tenant
|
||||||
.apply_timeline_archival_config(timeline_id, request_data.state)
|
.apply_timeline_archival_config(timeline_id, request_data.state)
|
||||||
.await?;
|
.await
|
||||||
|
.context("applying archival config")
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
Ok::<_, ApiError>(())
|
Ok::<_, ApiError>(())
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_archival_config",
|
.instrument(info_span!("timeline_archival_config",
|
||||||
@@ -874,10 +854,7 @@ async fn get_timestamp_of_lsn_handler(
|
|||||||
|
|
||||||
match result {
|
match result {
|
||||||
Some(time) => {
|
Some(time) => {
|
||||||
let time = format_rfc3339(
|
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||||
postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
|
|
||||||
)
|
|
||||||
.to_string();
|
|
||||||
json_response(StatusCode::OK, time)
|
json_response(StatusCode::OK, time)
|
||||||
}
|
}
|
||||||
None => Err(ApiError::NotFound(
|
None => Err(ApiError::NotFound(
|
||||||
@@ -1731,12 +1708,13 @@ async fn timeline_compact_handler(
|
|||||||
flags |= CompactFlags::ForceImageLayerCreation;
|
flags |= CompactFlags::ForceImageLayerCreation;
|
||||||
}
|
}
|
||||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||||
|
if !cfg!(feature = "testing") {
|
||||||
|
return Err(ApiError::InternalServerError(anyhow!(
|
||||||
|
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
||||||
|
)));
|
||||||
|
}
|
||||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||||
}
|
}
|
||||||
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
|
|
||||||
flags |= CompactFlags::DryRun;
|
|
||||||
}
|
|
||||||
|
|
||||||
let wait_until_uploaded =
|
let wait_until_uploaded =
|
||||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||||
|
|
||||||
@@ -1809,11 +1787,9 @@ async fn timeline_checkpoint_handler(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if wait_until_uploaded {
|
if wait_until_uploaded {
|
||||||
tracing::info!("Waiting for uploads to complete...");
|
|
||||||
timeline.remote_client.wait_completion().await
|
timeline.remote_client.wait_completion().await
|
||||||
// XXX map to correct ApiError for the cases where it's due to shutdown
|
// XXX map to correct ApiError for the cases where it's due to shutdown
|
||||||
.context("wait completion").map_err(ApiError::InternalServerError)?;
|
.context("wait completion").map_err(ApiError::InternalServerError)?;
|
||||||
tracing::info!("Uploads completed up to {}", timeline.get_remote_consistent_lsn_projected().unwrap_or(Lsn(0)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
@@ -1911,7 +1887,7 @@ async fn timeline_detach_ancestor_handler(
|
|||||||
// drop(tenant);
|
// drop(tenant);
|
||||||
|
|
||||||
let resp = match progress {
|
let resp = match progress {
|
||||||
detach_ancestor::Progress::Prepared(attempt, prepared) => {
|
detach_ancestor::Progress::Prepared(_guard, prepared) => {
|
||||||
// it would be great to tag the guard on to the tenant activation future
|
// it would be great to tag the guard on to the tenant activation future
|
||||||
let reparented_timelines = state
|
let reparented_timelines = state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
@@ -1919,10 +1895,11 @@ async fn timeline_detach_ancestor_handler(
|
|||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
prepared,
|
prepared,
|
||||||
attempt,
|
|
||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await
|
||||||
|
.context("timeline detach ancestor completion")
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
AncestorDetached {
|
AncestorDetached {
|
||||||
reparented_timelines,
|
reparented_timelines,
|
||||||
@@ -2354,20 +2331,6 @@ async fn put_io_engine_handler(
|
|||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn put_io_alignment_handler(
|
|
||||||
mut r: Request<Body>,
|
|
||||||
_cancel: CancellationToken,
|
|
||||||
) -> Result<Response<Body>, ApiError> {
|
|
||||||
check_permission(&r, None)?;
|
|
||||||
let align: usize = json_request(&mut r).await?;
|
|
||||||
crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
|
|
||||||
ApiError::PreconditionFailed(
|
|
||||||
format!("Requested io alignment ({align}) is not a power of two").into(),
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
json_response(StatusCode::OK, ())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Polled by control plane.
|
/// Polled by control plane.
|
||||||
///
|
///
|
||||||
/// See [`crate::utilization`].
|
/// See [`crate::utilization`].
|
||||||
@@ -2394,9 +2357,8 @@ async fn get_utilization(
|
|||||||
// regenerate at most 1Hz to allow polling at any rate.
|
// regenerate at most 1Hz to allow polling at any rate.
|
||||||
if !still_valid {
|
if !still_valid {
|
||||||
let path = state.conf.tenants_path();
|
let path = state.conf.tenants_path();
|
||||||
let doc =
|
let doc = crate::utilization::regenerate(path.as_std_path())
|
||||||
crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
|
.map_err(ApiError::InternalServerError)?;
|
||||||
.map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
let mut buf = Vec::new();
|
||||||
serde_json::to_writer(&mut buf, &doc)
|
serde_json::to_writer(&mut buf, &doc)
|
||||||
@@ -2980,7 +2942,7 @@ pub fn make_router(
|
|||||||
)
|
)
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||||
|r| api_handler(r, timeline_compact_handler),
|
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
||||||
)
|
)
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||||
@@ -3055,9 +3017,6 @@ pub fn make_router(
|
|||||||
|r| api_handler(r, timeline_collect_keyspace),
|
|r| api_handler(r, timeline_collect_keyspace),
|
||||||
)
|
)
|
||||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||||
.put("/v1/io_alignment", |r| {
|
|
||||||
api_handler(r, put_io_alignment_handler)
|
|
||||||
})
|
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||||
|
|||||||
@@ -1,10 +1,15 @@
|
|||||||
use std::{num::NonZeroUsize, sync::Arc};
|
use std::{num::NonZeroUsize, sync::Arc};
|
||||||
|
|
||||||
|
use crate::tenant::ephemeral_file;
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||||
pub enum L0FlushConfig {
|
pub enum L0FlushConfig {
|
||||||
|
PageCached,
|
||||||
#[serde(rename_all = "snake_case")]
|
#[serde(rename_all = "snake_case")]
|
||||||
Direct { max_concurrency: NonZeroUsize },
|
Direct {
|
||||||
|
max_concurrency: NonZeroUsize,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for L0FlushConfig {
|
impl Default for L0FlushConfig {
|
||||||
@@ -20,12 +25,14 @@ impl Default for L0FlushConfig {
|
|||||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||||
|
|
||||||
pub enum Inner {
|
pub enum Inner {
|
||||||
|
PageCached,
|
||||||
Direct { semaphore: tokio::sync::Semaphore },
|
Direct { semaphore: tokio::sync::Semaphore },
|
||||||
}
|
}
|
||||||
|
|
||||||
impl L0FlushGlobalState {
|
impl L0FlushGlobalState {
|
||||||
pub fn new(config: L0FlushConfig) -> Self {
|
pub fn new(config: L0FlushConfig) -> Self {
|
||||||
match config {
|
match config {
|
||||||
|
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
|
||||||
L0FlushConfig::Direct { max_concurrency } => {
|
L0FlushConfig::Direct { max_concurrency } => {
|
||||||
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
||||||
Self(Arc::new(Inner::Direct { semaphore }))
|
Self(Arc::new(Inner::Direct { semaphore }))
|
||||||
@@ -37,3 +44,13 @@ impl L0FlushGlobalState {
|
|||||||
&self.0
|
&self.0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl L0FlushConfig {
|
||||||
|
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
|
||||||
|
use L0FlushConfig::*;
|
||||||
|
match self {
|
||||||
|
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
|
||||||
|
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -16,7 +16,6 @@ pub mod l0_flush;
|
|||||||
use futures::{stream::FuturesUnordered, StreamExt};
|
use futures::{stream::FuturesUnordered, StreamExt};
|
||||||
pub use pageserver_api::keyspace;
|
pub use pageserver_api::keyspace;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
mod assert_u64_eq_usize;
|
|
||||||
pub mod aux_file;
|
pub mod aux_file;
|
||||||
pub mod metrics;
|
pub mod metrics;
|
||||||
pub mod page_cache;
|
pub mod page_cache;
|
||||||
@@ -32,7 +31,6 @@ pub mod virtual_file;
|
|||||||
pub mod walingest;
|
pub mod walingest;
|
||||||
pub mod walrecord;
|
pub mod walrecord;
|
||||||
pub mod walredo;
|
pub mod walredo;
|
||||||
pub mod pg_import;
|
|
||||||
|
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use deletion_queue::DeletionQueue;
|
use deletion_queue::DeletionQueue;
|
||||||
@@ -51,7 +49,7 @@ use tracing::{info, info_span};
|
|||||||
/// backwards-compatible changes to the metadata format.
|
/// backwards-compatible changes to the metadata format.
|
||||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||||
|
|
||||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||||
|
|
||||||
// Magic constants used to identify different kinds of files
|
// Magic constants used to identify different kinds of files
|
||||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||||
@@ -90,8 +88,6 @@ pub async fn shutdown_pageserver(
|
|||||||
) {
|
) {
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
let started_at = std::time::Instant::now();
|
|
||||||
|
|
||||||
// If the orderly shutdown below takes too long, we still want to make
|
// If the orderly shutdown below takes too long, we still want to make
|
||||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||||
//
|
//
|
||||||
@@ -245,10 +241,7 @@ pub async fn shutdown_pageserver(
|
|||||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||||
info!("walredo_extraordinary_shutdown_thread done");
|
info!("walredo_extraordinary_shutdown_thread done");
|
||||||
|
|
||||||
info!(
|
info!("Shut down successfully completed");
|
||||||
elapsed_ms = started_at.elapsed().as_millis(),
|
|
||||||
"Shut down successfully completed"
|
|
||||||
);
|
|
||||||
std::process::exit(exit_code);
|
std::process::exit(exit_code);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1552,6 +1552,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
|||||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||||
pub(crate) enum ComputeCommandKind {
|
pub(crate) enum ComputeCommandKind {
|
||||||
PageStreamV2,
|
PageStreamV2,
|
||||||
|
PageStream,
|
||||||
Basebackup,
|
Basebackup,
|
||||||
Fullbackup,
|
Fullbackup,
|
||||||
LeaseLsn,
|
LeaseLsn,
|
||||||
@@ -1802,23 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
|
|
||||||
register_uint_gauge!(
|
|
||||||
"pageserver_utilization_score",
|
|
||||||
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric")
|
|
||||||
});
|
|
||||||
|
|
||||||
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
|
||||||
register_uint_gauge_vec!(
|
|
||||||
"pageserver_secondary_heatmap_total_size",
|
|
||||||
"The total size in bytes of all layers in the most recently downloaded heatmap.",
|
|
||||||
&["tenant_id", "shard_id"]
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric")
|
|
||||||
});
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
pub enum RemoteOpKind {
|
pub enum RemoteOpKind {
|
||||||
Upload,
|
Upload,
|
||||||
@@ -1869,64 +1853,16 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
|||||||
.expect("Failed to register tenant_task_events metric")
|
.expect("Failed to register tenant_task_events metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
pub struct BackgroundLoopSemaphoreMetrics {
|
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||||
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
register_int_counter_pair_vec!(
|
||||||
durations: EnumMap<BackgroundLoopKind, Counter>,
|
"pageserver_background_loop_semaphore_wait_start_count",
|
||||||
}
|
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||||
|
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||||
|| {
|
&["task"],
|
||||||
let counters = register_int_counter_pair_vec!(
|
)
|
||||||
"pageserver_background_loop_semaphore_wait_start_count",
|
.unwrap()
|
||||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
});
|
||||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
|
||||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
|
||||||
&["task"],
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let durations = register_counter_vec!(
|
|
||||||
"pageserver_background_loop_semaphore_wait_duration_seconds",
|
|
||||||
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
|
|
||||||
&["task"],
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
BackgroundLoopSemaphoreMetrics {
|
|
||||||
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
|
||||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
|
||||||
counters.with_label_values(&[kind.into()])
|
|
||||||
})),
|
|
||||||
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
|
||||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
|
||||||
durations.with_label_values(&[kind.into()])
|
|
||||||
})),
|
|
||||||
}
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
impl BackgroundLoopSemaphoreMetrics {
|
|
||||||
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
|
|
||||||
struct Record<'a> {
|
|
||||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
|
||||||
task: BackgroundLoopKind,
|
|
||||||
_counter_guard: metrics::IntCounterPairGuard,
|
|
||||||
start: Instant,
|
|
||||||
}
|
|
||||||
impl Drop for Record<'_> {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
let elapsed = self.start.elapsed().as_secs_f64();
|
|
||||||
self.metrics.durations[self.task].inc_by(elapsed);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Record {
|
|
||||||
metrics: self,
|
|
||||||
task,
|
|
||||||
_counter_guard: self.counters[task].guard(),
|
|
||||||
start: Instant::now(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
register_int_counter_vec!(
|
register_int_counter_vec!(
|
||||||
@@ -2608,7 +2544,6 @@ use std::time::{Duration, Instant};
|
|||||||
use crate::context::{PageContentKind, RequestContext};
|
use crate::context::{PageContentKind, RequestContext};
|
||||||
use crate::task_mgr::TaskKind;
|
use crate::task_mgr::TaskKind;
|
||||||
use crate::tenant::mgr::TenantSlot;
|
use crate::tenant::mgr::TenantSlot;
|
||||||
use crate::tenant::tasks::BackgroundLoopKind;
|
|
||||||
|
|
||||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||||
|
|||||||
@@ -557,7 +557,7 @@ impl PageServerHandler {
|
|||||||
pgb: &mut PostgresBackend<IO>,
|
pgb: &mut PostgresBackend<IO>,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
_protocol_version: PagestreamProtocolVersion,
|
protocol_version: PagestreamProtocolVersion,
|
||||||
ctx: RequestContext,
|
ctx: RequestContext,
|
||||||
) -> Result<(), QueryError>
|
) -> Result<(), QueryError>
|
||||||
where
|
where
|
||||||
@@ -601,7 +601,8 @@ impl PageServerHandler {
|
|||||||
fail::fail_point!("ps::handle-pagerequest-message");
|
fail::fail_point!("ps::handle-pagerequest-message");
|
||||||
|
|
||||||
// parse request
|
// parse request
|
||||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
let neon_fe_msg =
|
||||||
|
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||||
|
|
||||||
// invoke handler function
|
// invoke handler function
|
||||||
let (handler_result, span) = match neon_fe_msg {
|
let (handler_result, span) = match neon_fe_msg {
|
||||||
@@ -753,21 +754,16 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if request_lsn < **latest_gc_cutoff_lsn {
|
if request_lsn < **latest_gc_cutoff_lsn {
|
||||||
let gc_info = &timeline.gc_info.read().unwrap();
|
// Check explicitly for INVALID just to get a less scary error message if the
|
||||||
if !gc_info.leases.contains_key(&request_lsn) {
|
// request is obviously bogus
|
||||||
// The requested LSN is below gc cutoff and is not guarded by a lease.
|
return Err(if request_lsn == Lsn::INVALID {
|
||||||
|
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||||
// Check explicitly for INVALID just to get a less scary error message if the
|
} else {
|
||||||
// request is obviously bogus
|
PageStreamError::BadRequest(format!(
|
||||||
return Err(if request_lsn == Lsn::INVALID {
|
|
||||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
|
||||||
} else {
|
|
||||||
PageStreamError::BadRequest(format!(
|
|
||||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||||
request_lsn, **latest_gc_cutoff_lsn
|
request_lsn, **latest_gc_cutoff_lsn
|
||||||
).into())
|
).into())
|
||||||
});
|
});
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||||
@@ -794,8 +790,6 @@ impl PageServerHandler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Handles the lsn lease request.
|
|
||||||
/// If a lease cannot be obtained, the client will receive NULL.
|
|
||||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||||
async fn handle_make_lsn_lease<IO>(
|
async fn handle_make_lsn_lease<IO>(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -818,25 +812,19 @@ impl PageServerHandler {
|
|||||||
.await?;
|
.await?;
|
||||||
set_tracing_field_shard_id(&timeline);
|
set_tracing_field_shard_id(&timeline);
|
||||||
|
|
||||||
let lease = timeline
|
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
|
||||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
let valid_until = lease
|
||||||
.inspect_err(|e| {
|
.valid_until
|
||||||
warn!("{e}");
|
.duration_since(SystemTime::UNIX_EPOCH)
|
||||||
})
|
.map_err(|e| QueryError::Other(e.into()))?;
|
||||||
.ok();
|
|
||||||
let valid_until_str = lease.map(|l| {
|
|
||||||
l.valid_until
|
|
||||||
.duration_since(SystemTime::UNIX_EPOCH)
|
|
||||||
.expect("valid_until is earlier than UNIX_EPOCH")
|
|
||||||
.as_millis()
|
|
||||||
.to_string()
|
|
||||||
});
|
|
||||||
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
|
|
||||||
|
|
||||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||||
b"valid_until",
|
b"valid_until",
|
||||||
)]))?
|
)]))?
|
||||||
.write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
|
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
||||||
|
&valid_until.as_millis().to_be_bytes(),
|
||||||
|
)]))?
|
||||||
|
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1287,6 +1275,35 @@ where
|
|||||||
ctx,
|
ctx,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||||
|
if params.len() != 2 {
|
||||||
|
return Err(QueryError::Other(anyhow::anyhow!(
|
||||||
|
"invalid param number for pagestream command"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
let tenant_id = TenantId::from_str(params[0])
|
||||||
|
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||||
|
let timeline_id = TimelineId::from_str(params[1])
|
||||||
|
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||||
|
|
||||||
|
tracing::Span::current()
|
||||||
|
.record("tenant_id", field::display(tenant_id))
|
||||||
|
.record("timeline_id", field::display(timeline_id));
|
||||||
|
|
||||||
|
self.check_permission(Some(tenant_id))?;
|
||||||
|
|
||||||
|
COMPUTE_COMMANDS_COUNTERS
|
||||||
|
.for_command(ComputeCommandKind::PageStream)
|
||||||
|
.inc();
|
||||||
|
|
||||||
|
self.handle_pagerequests(
|
||||||
|
pgb,
|
||||||
|
tenant_id,
|
||||||
|
timeline_id,
|
||||||
|
PagestreamProtocolVersion::V1,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||||
if params.len() < 2 {
|
if params.len() < 2 {
|
||||||
return Err(QueryError::Other(anyhow::anyhow!(
|
return Err(QueryError::Other(anyhow::anyhow!(
|
||||||
|
|||||||
@@ -1,650 +0,0 @@
|
|||||||
use std::fs::metadata;
|
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context};
|
|
||||||
use bytes::Bytes;
|
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
|
||||||
|
|
||||||
use itertools::Itertools;
|
|
||||||
use pageserver_api::{key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, reltag::RelTag};
|
|
||||||
use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, ControlFileData, BLCKSZ};
|
|
||||||
use tokio::{io::AsyncRead, task::{self, JoinHandle}};
|
|
||||||
use tracing::debug;
|
|
||||||
use utils::{id::{NodeId, TenantId, TimelineId}, shard::{ShardCount, ShardNumber, TenantShardId}};
|
|
||||||
use walkdir::WalkDir;
|
|
||||||
|
|
||||||
use crate::{context::{DownloadBehavior, RequestContext}, pgdatadir_mapping::{DbDirectory, RelDirectory}, task_mgr::TaskKind, tenant::storage_layer::ImageLayerWriter};
|
|
||||||
use crate::pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory};
|
|
||||||
use crate::config::PageServerConf;
|
|
||||||
use tokio::io::AsyncReadExt;
|
|
||||||
|
|
||||||
use crate::tenant::storage_layer::PersistentLayerDesc;
|
|
||||||
use utils::generation::Generation;
|
|
||||||
use utils::lsn::Lsn;
|
|
||||||
use crate::tenant::IndexPart;
|
|
||||||
use crate::tenant::metadata::TimelineMetadata;
|
|
||||||
use crate::tenant::remote_timeline_client;
|
|
||||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
|
||||||
use pageserver_api::shard::ShardIndex;
|
|
||||||
use pageserver_api::key::Key;
|
|
||||||
use pageserver_api::keyspace::{is_contiguous_range, contiguous_range_len};
|
|
||||||
use pageserver_api::keyspace::singleton_range;
|
|
||||||
use pageserver_api::reltag::SlruKind;
|
|
||||||
use pageserver_api::key::{slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, TWOPHASEDIR_KEY, CONTROLFILE_KEY, CHECKPOINT_KEY};
|
|
||||||
use utils::bin_ser::BeSer;
|
|
||||||
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
pub struct PgImportEnv {
|
|
||||||
conf: &'static PageServerConf,
|
|
||||||
tli: TimelineId,
|
|
||||||
tsi: TenantShardId,
|
|
||||||
|
|
||||||
pgdata_lsn: Lsn,
|
|
||||||
|
|
||||||
tasks: Vec<AnyImportTask>,
|
|
||||||
|
|
||||||
layers: Vec<PersistentLayerDesc>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PgImportEnv {
|
|
||||||
|
|
||||||
pub async fn init(dstdir: &Utf8Path, tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<PgImportEnv> {
|
|
||||||
let config = toml_edit::Document::new();
|
|
||||||
let conf = PageServerConf::parse_and_validate(
|
|
||||||
NodeId(42),
|
|
||||||
&config,
|
|
||||||
dstdir
|
|
||||||
)?;
|
|
||||||
let conf = Box::leak(Box::new(conf));
|
|
||||||
|
|
||||||
let tsi = TenantShardId {
|
|
||||||
tenant_id,
|
|
||||||
shard_number: ShardNumber(0),
|
|
||||||
shard_count: ShardCount(0),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(PgImportEnv {
|
|
||||||
conf,
|
|
||||||
tli: timeline_id,
|
|
||||||
tsi,
|
|
||||||
pgdata_lsn: Lsn(0), // Will be filled in later, when the control file is imported
|
|
||||||
|
|
||||||
tasks: Vec::new(),
|
|
||||||
layers: Vec::new(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn import_datadir(&mut self, pgdata_path: &Utf8PathBuf) -> anyhow::Result<()> {
|
|
||||||
// Read control file
|
|
||||||
let controlfile_path = pgdata_path.join("global").join("pg_control");
|
|
||||||
let controlfile_buf = std::fs::read(&controlfile_path)
|
|
||||||
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
|
|
||||||
let control_file = ControlFileData::decode(&controlfile_buf)?;
|
|
||||||
|
|
||||||
let pgdata_lsn = Lsn(control_file.checkPoint).align();
|
|
||||||
let timeline_path = self.conf.timeline_path(&self.tsi, &self.tli);
|
|
||||||
|
|
||||||
println!("Importing {pgdata_path} to {timeline_path} as lsn {pgdata_lsn}...");
|
|
||||||
self.pgdata_lsn = pgdata_lsn;
|
|
||||||
|
|
||||||
let datadir = PgDataDir::new(pgdata_path);
|
|
||||||
|
|
||||||
// Import dbdir (00:00:00 keyspace)
|
|
||||||
// This is just constructed here, but will be written to the image layer in the first call to import_db()
|
|
||||||
let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory {
|
|
||||||
dbdirs: datadir.dbs.iter().map(|db| ((db.spcnode, db.dboid), true)).collect(),
|
|
||||||
})?);
|
|
||||||
self.tasks.push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());
|
|
||||||
|
|
||||||
// Import databases (00:spcnode:dbnode keyspace for each db)
|
|
||||||
for db in datadir.dbs {
|
|
||||||
self.import_db(&db).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Import SLRUs
|
|
||||||
|
|
||||||
// pg_xact (01:00 keyspace)
|
|
||||||
self.import_slru(SlruKind::Clog, &pgdata_path.join("pg_xact")).await?;
|
|
||||||
// pg_multixact/members (01:01 keyspace)
|
|
||||||
self.import_slru(SlruKind::MultiXactMembers, &pgdata_path.join("pg_multixact/members")).await?;
|
|
||||||
// pg_multixact/offsets (01:02 keyspace)
|
|
||||||
self.import_slru(SlruKind::MultiXactOffsets, &pgdata_path.join("pg_multixact/offsets")).await?;
|
|
||||||
|
|
||||||
// Import pg_twophase.
|
|
||||||
// TODO: as empty
|
|
||||||
let twophasedir_buf = TwoPhaseDirectory::ser(
|
|
||||||
&TwoPhaseDirectory { xids: HashSet::new() }
|
|
||||||
)?;
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(TWOPHASEDIR_KEY, Bytes::from(twophasedir_buf))));
|
|
||||||
|
|
||||||
// Controlfile, checkpoint
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(CONTROLFILE_KEY, Bytes::from(controlfile_buf))));
|
|
||||||
|
|
||||||
let checkpoint_buf = control_file.checkPointCopy.encode()?;
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(CHECKPOINT_KEY, checkpoint_buf)));
|
|
||||||
|
|
||||||
// Assigns parts of key space to later parallel jobs
|
|
||||||
let mut last_end_key = Key::MIN;
|
|
||||||
let mut current_chunk = Vec::new();
|
|
||||||
let mut current_chunk_size: usize = 0;
|
|
||||||
let mut parallel_jobs = Vec::new();
|
|
||||||
for task in std::mem::take(&mut self.tasks).into_iter() {
|
|
||||||
if current_chunk_size + task.total_size() > 1024*1024*1024 {
|
|
||||||
let key_range = last_end_key..task.key_range().start;
|
|
||||||
parallel_jobs.push(ChunkProcessingJob::new(
|
|
||||||
key_range.clone(),
|
|
||||||
std::mem::take(&mut current_chunk),
|
|
||||||
self
|
|
||||||
));
|
|
||||||
last_end_key = key_range.end;
|
|
||||||
current_chunk_size = 0;
|
|
||||||
}
|
|
||||||
current_chunk_size += task.total_size();
|
|
||||||
current_chunk.push(task);
|
|
||||||
}
|
|
||||||
parallel_jobs.push(ChunkProcessingJob::new(
|
|
||||||
last_end_key..Key::NON_L0_MAX,
|
|
||||||
current_chunk,
|
|
||||||
self
|
|
||||||
));
|
|
||||||
|
|
||||||
// Start all jobs simultaneosly
|
|
||||||
// TODO: semaphore?
|
|
||||||
let mut handles = vec![];
|
|
||||||
for job in parallel_jobs {
|
|
||||||
let handle: JoinHandle<anyhow::Result<PersistentLayerDesc>> = task::spawn(async move {
|
|
||||||
let layerdesc = job.run().await?;
|
|
||||||
Ok(layerdesc)
|
|
||||||
});
|
|
||||||
handles.push(handle);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Wait for all jobs to complete
|
|
||||||
for handle in handles {
|
|
||||||
let layerdesc = handle.await??;
|
|
||||||
self.layers.push(layerdesc);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create index_part.json file
|
|
||||||
self.create_index_part(&control_file).await?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn import_db(
|
|
||||||
&mut self,
|
|
||||||
db: &PgDataDirDb,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
debug!(
|
|
||||||
"Importing database (path={}, tablespace={}, dboid={})",
|
|
||||||
db.path, db.spcnode, db.dboid
|
|
||||||
);
|
|
||||||
|
|
||||||
// Import relmap (00:spcnode:dbnode:00:*:00)
|
|
||||||
let relmap_key = relmap_file_key(db.spcnode, db.dboid);
|
|
||||||
debug!("Constructing relmap entry, key {relmap_key}");
|
|
||||||
let mut relmap_file = tokio::fs::File::open(&db.path.join("pg_filenode.map")).await?;
|
|
||||||
let relmap_buf = read_all_bytes(&mut relmap_file).await?;
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(relmap_key, relmap_buf)));
|
|
||||||
|
|
||||||
// Import reldir (00:spcnode:dbnode:00:*:01)
|
|
||||||
let reldir_key = rel_dir_to_key(db.spcnode, db.dboid);
|
|
||||||
debug!("Constructing reldirs entry, key {reldir_key}");
|
|
||||||
let reldir_buf = RelDirectory::ser(&RelDirectory {
|
|
||||||
rels: db.files.iter().map(|f| (f.rel_tag.relnode, f.rel_tag.forknum)).collect(),
|
|
||||||
})?;
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(reldir_key, Bytes::from(reldir_buf))));
|
|
||||||
|
|
||||||
// Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last
|
|
||||||
// segment in a given relation (00:spcnode:dbnode:reloid:fork:ff)
|
|
||||||
for file in &db.files {
|
|
||||||
let len = metadata(&file.path)?.len() as usize;
|
|
||||||
ensure!(len % 8192 == 0);
|
|
||||||
let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192);
|
|
||||||
let start_key = rel_block_to_key(file.rel_tag, start_blk);
|
|
||||||
let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32);
|
|
||||||
self.tasks.push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(start_key..end_key, &file.path)));
|
|
||||||
|
|
||||||
// Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff)
|
|
||||||
if let Some(nblocks) = file.nblocks {
|
|
||||||
let size_key = rel_size_to_key(file.rel_tag);
|
|
||||||
//debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}");
|
|
||||||
let buf = nblocks.to_le_bytes();
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(size_key, Bytes::from(buf.to_vec()))));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn import_slru(
|
|
||||||
&mut self,
|
|
||||||
kind: SlruKind,
|
|
||||||
path: &Utf8PathBuf,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let segments: Vec<(String, u32)> = WalkDir::new(path)
|
|
||||||
.max_depth(1)
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|entry| {
|
|
||||||
let entry = entry.ok()?;
|
|
||||||
let filename = entry.file_name();
|
|
||||||
let filename = filename.to_string_lossy();
|
|
||||||
let segno = u32::from_str_radix(&filename, 16).ok()?;
|
|
||||||
Some((filename.to_string(), segno))
|
|
||||||
}).collect();
|
|
||||||
|
|
||||||
// Write SlruDir
|
|
||||||
let slrudir_key = slru_dir_to_key(kind);
|
|
||||||
let segnos: HashSet<u32> = segments.iter().map(|(_path, segno)| { *segno }).collect();
|
|
||||||
let slrudir = SlruSegmentDirectory {
|
|
||||||
segments: segnos,
|
|
||||||
};
|
|
||||||
let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?;
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(slrudir_key, Bytes::from(slrudir_buf))));
|
|
||||||
|
|
||||||
for (segpath, segno) in segments {
|
|
||||||
// SlruSegBlocks for each segment
|
|
||||||
let p = path.join(Utf8PathBuf::from(segpath));
|
|
||||||
let file_size = std::fs::metadata(&p)?.len();
|
|
||||||
ensure!(file_size % 8192 == 0);
|
|
||||||
let nblocks = u32::try_from(file_size / 8192)?;
|
|
||||||
let start_key = slru_block_to_key(kind, segno, 0);
|
|
||||||
let end_key = slru_block_to_key(kind, segno, nblocks);
|
|
||||||
self.tasks.push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(start_key..end_key, &p)));
|
|
||||||
|
|
||||||
// Followed by SlruSegSize
|
|
||||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
|
||||||
let segsize_buf = nblocks.to_le_bytes();
|
|
||||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(segsize_key, Bytes::copy_from_slice(&segsize_buf))));
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn create_index_part(&mut self, control_file: &ControlFileData) -> anyhow::Result<()> {
|
|
||||||
let dstdir = &self.conf.workdir;
|
|
||||||
|
|
||||||
let pg_version = match control_file.catalog_version_no {
|
|
||||||
// thesea are from catversion.h
|
|
||||||
202107181 => 14,
|
|
||||||
202209061 => 15,
|
|
||||||
202307071 => 16,
|
|
||||||
catversion => { bail!("unrecognized catalog version {catversion}")},
|
|
||||||
};
|
|
||||||
|
|
||||||
let metadata = TimelineMetadata::new(
|
|
||||||
// FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
|
|
||||||
// checkpoint record, and prev_record_lsn should point to its beginning.
|
|
||||||
// We should read the real end of the record from the WAL, but here we
|
|
||||||
// just fake it.
|
|
||||||
Lsn(self.pgdata_lsn.0 + 8),
|
|
||||||
Some(self.pgdata_lsn),
|
|
||||||
None, // no ancestor
|
|
||||||
Lsn(0),
|
|
||||||
self.pgdata_lsn, // latest_gc_cutoff_lsn
|
|
||||||
self.pgdata_lsn, // initdb_lsn
|
|
||||||
pg_version,
|
|
||||||
);
|
|
||||||
let generation = Generation::none();
|
|
||||||
let mut index_part = IndexPart::empty(metadata);
|
|
||||||
|
|
||||||
for l in self.layers.iter() {
|
|
||||||
let name = l.layer_name();
|
|
||||||
let metadata = LayerFileMetadata::new(l.file_size, generation, ShardIndex::unsharded());
|
|
||||||
if let Some(_) = index_part.layer_metadata.insert(name.clone(), metadata) {
|
|
||||||
bail!("duplicate layer filename {name}");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let data = index_part.to_s3_bytes()?;
|
|
||||||
let path = remote_timeline_client::remote_index_path(&self.tsi, &self.tli, generation);
|
|
||||||
let path = dstdir.join(path.get_path());
|
|
||||||
std::fs::write(&path, data)
|
|
||||||
.context("could not write {path}")?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// dbdir iteration tools
|
|
||||||
//
|
|
||||||
|
|
||||||
struct PgDataDir {
|
|
||||||
pub dbs: Vec<PgDataDirDb> // spcnode, dboid, path
|
|
||||||
}
|
|
||||||
|
|
||||||
struct PgDataDirDb {
|
|
||||||
pub spcnode: u32,
|
|
||||||
pub dboid: u32,
|
|
||||||
pub path: Utf8PathBuf,
|
|
||||||
pub files: Vec<PgDataDirDbFile>
|
|
||||||
}
|
|
||||||
|
|
||||||
struct PgDataDirDbFile {
|
|
||||||
pub path: Utf8PathBuf,
|
|
||||||
pub rel_tag: RelTag,
|
|
||||||
pub segno: u32,
|
|
||||||
|
|
||||||
// Cummulative size of the given fork, set only for the last segment of that fork
|
|
||||||
pub nblocks: Option<usize>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PgDataDir {
|
|
||||||
fn new(datadir_path: &Utf8PathBuf) -> Self {
|
|
||||||
// Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first
|
|
||||||
// Traverse database in increasing oid order
|
|
||||||
let mut databases = WalkDir::new(datadir_path.join("base"))
|
|
||||||
.max_depth(1)
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|entry| {
|
|
||||||
entry.ok().and_then(|path| {
|
|
||||||
path.file_name().to_string_lossy().parse::<u32>().ok()
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.sorted()
|
|
||||||
.map(|dboid| {
|
|
||||||
PgDataDirDb::new(
|
|
||||||
datadir_path.join("base").join(dboid.to_string()),
|
|
||||||
pg_constants::DEFAULTTABLESPACE_OID,
|
|
||||||
dboid,
|
|
||||||
datadir_path
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
// special case for global catalogs
|
|
||||||
databases.push(PgDataDirDb::new(
|
|
||||||
datadir_path.join("global"),
|
|
||||||
postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
|
|
||||||
0,
|
|
||||||
datadir_path,
|
|
||||||
));
|
|
||||||
|
|
||||||
databases.sort_by_key(|db| (db.spcnode, db.dboid));
|
|
||||||
|
|
||||||
Self {
|
|
||||||
dbs: databases
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PgDataDirDb {
|
|
||||||
fn new(db_path: Utf8PathBuf, spcnode: u32, dboid: u32, datadir_path: &Utf8PathBuf) -> Self {
|
|
||||||
let mut files: Vec<PgDataDirDbFile> = WalkDir::new(&db_path)
|
|
||||||
.min_depth(1)
|
|
||||||
.max_depth(2)
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|entry| {
|
|
||||||
entry.ok().and_then(|path| {
|
|
||||||
let relfile = path.file_name().to_string_lossy();
|
|
||||||
// returns (relnode, forknum, segno)
|
|
||||||
parse_relfilename(&relfile).ok()
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.sorted()
|
|
||||||
.map(|(relnode, forknum, segno)| {
|
|
||||||
let rel_tag = RelTag {
|
|
||||||
spcnode,
|
|
||||||
dbnode: dboid,
|
|
||||||
relnode,
|
|
||||||
forknum,
|
|
||||||
};
|
|
||||||
|
|
||||||
let path = datadir_path.join(rel_tag.to_segfile_name(segno));
|
|
||||||
let len = metadata(&path).unwrap().len() as usize;
|
|
||||||
assert!(len % BLCKSZ as usize == 0);
|
|
||||||
let nblocks = len / BLCKSZ as usize;
|
|
||||||
|
|
||||||
PgDataDirDbFile {
|
|
||||||
path,
|
|
||||||
rel_tag,
|
|
||||||
segno,
|
|
||||||
nblocks: Some(nblocks), // first non-cummulative sizes
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
// Set cummulative sizes. Do all of that math here, so that later we could easier
|
|
||||||
// parallelize over segments and know with which segments we need to write relsize
|
|
||||||
// entry.
|
|
||||||
let mut cumulative_nblocks: usize= 0;
|
|
||||||
let mut prev_rel_tag: Option<RelTag> = None;
|
|
||||||
for i in 0..files.len() {
|
|
||||||
if prev_rel_tag == Some(files[i].rel_tag) {
|
|
||||||
cumulative_nblocks += files[i].nblocks.unwrap();
|
|
||||||
} else {
|
|
||||||
cumulative_nblocks = files[i].nblocks.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
files[i].nblocks = if i == files.len() - 1 || files[i+1].rel_tag != files[i].rel_tag {
|
|
||||||
Some(cumulative_nblocks)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
prev_rel_tag = Some(files[i].rel_tag);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
PgDataDirDb {
|
|
||||||
files,
|
|
||||||
path: db_path,
|
|
||||||
spcnode,
|
|
||||||
dboid,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Reads `reader` to its end and returns everything read as `Bytes`.
async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> anyhow::Result<Bytes> {
    let mut contents = Vec::<u8>::new();
    reader.read_to_end(&mut contents).await?;
    Ok(contents.into())
}
|
|
||||||
|
|
||||||
trait ImportTask {
|
|
||||||
fn key_range(&self) -> Range<Key>;
|
|
||||||
|
|
||||||
fn total_size(&self) -> usize {
|
|
||||||
if is_contiguous_range(&self.key_range()) {
|
|
||||||
contiguous_range_len(&self.key_range()) as usize * 8192
|
|
||||||
} else {
|
|
||||||
u32::MAX as usize
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()>;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ImportSingleKeyTask {
|
|
||||||
key: Key,
|
|
||||||
buf: Bytes,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportSingleKeyTask {
|
|
||||||
fn new(key: Key, buf: Bytes) -> Self {
|
|
||||||
ImportSingleKeyTask { key, buf }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportTask for ImportSingleKeyTask {
|
|
||||||
fn key_range(&self) -> Range<Key> {
|
|
||||||
singleton_range(self.key)
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
|
||||||
layer_writer.put_image(self.key, self.buf, ctx).await?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ImportRelBlocksTask {
|
|
||||||
key_range: Range<Key>,
|
|
||||||
path: Utf8PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportRelBlocksTask {
|
|
||||||
fn new(key_range: Range<Key>, path: &Utf8Path) -> Self {
|
|
||||||
ImportRelBlocksTask {
|
|
||||||
key_range,
|
|
||||||
path: path.into()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportTask for ImportRelBlocksTask {
|
|
||||||
fn key_range(&self) -> Range<Key> {
|
|
||||||
self.key_range.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
|
||||||
debug!("Importing relation file {}", self.path);
|
|
||||||
let mut reader = tokio::fs::File::open(&self.path).await?;
|
|
||||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
|
||||||
|
|
||||||
let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
|
|
||||||
let (_rel_tag, end_blk) = self.key_range.end.to_rel_block()?;
|
|
||||||
let mut blknum = start_blk;
|
|
||||||
while blknum < end_blk {
|
|
||||||
reader.read_exact(&mut buf).await?;
|
|
||||||
let key = rel_block_to_key(rel_tag.clone(), blknum);
|
|
||||||
layer_writer.put_image(key, Bytes::copy_from_slice(&buf), ctx).await?;
|
|
||||||
blknum += 1;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ImportSlruBlocksTask {
|
|
||||||
key_range: Range<Key>,
|
|
||||||
path: Utf8PathBuf,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportSlruBlocksTask {
|
|
||||||
fn new(key_range: Range<Key>, path: &Utf8Path) -> Self {
|
|
||||||
ImportSlruBlocksTask {
|
|
||||||
key_range,
|
|
||||||
path: path.into()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportTask for ImportSlruBlocksTask {
|
|
||||||
fn key_range(&self) -> Range<Key> {
|
|
||||||
self.key_range.clone()
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
|
||||||
debug!("Importing SLRU segment file {}", self.path);
|
|
||||||
let mut reader = tokio::fs::File::open(&self.path).await
|
|
||||||
.context(format!("opening {}", &self.path))?;
|
|
||||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
|
||||||
|
|
||||||
let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
|
|
||||||
let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
|
|
||||||
let mut blknum = start_blk;
|
|
||||||
while blknum < end_blk {
|
|
||||||
reader.read_exact(&mut buf).await?;
|
|
||||||
let key = slru_block_to_key(kind, segno, blknum);
|
|
||||||
layer_writer.put_image(key, Bytes::copy_from_slice(&buf), ctx).await?;
|
|
||||||
blknum += 1;
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
enum AnyImportTask {
|
|
||||||
SingleKey(ImportSingleKeyTask),
|
|
||||||
RelBlocks(ImportRelBlocksTask),
|
|
||||||
SlruBlocks(ImportSlruBlocksTask),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ImportTask for AnyImportTask {
|
|
||||||
fn key_range(&self) -> Range<Key> {
|
|
||||||
match self {
|
|
||||||
Self::SingleKey(t) => t.key_range(),
|
|
||||||
Self::RelBlocks(t) => t.key_range(),
|
|
||||||
Self::SlruBlocks(t) => t.key_range()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
|
||||||
match self {
|
|
||||||
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
|
|
||||||
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
|
|
||||||
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<ImportSingleKeyTask> for AnyImportTask {
|
|
||||||
fn from(t: ImportSingleKeyTask) -> Self {
|
|
||||||
Self::SingleKey(t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<ImportRelBlocksTask> for AnyImportTask {
|
|
||||||
fn from(t: ImportRelBlocksTask) -> Self {
|
|
||||||
Self::RelBlocks(t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<ImportSlruBlocksTask> for AnyImportTask {
|
|
||||||
fn from(t: ImportSlruBlocksTask) -> Self {
|
|
||||||
Self::SlruBlocks(t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ChunkProcessingJob {
|
|
||||||
range: Range<Key>,
|
|
||||||
tasks: Vec<AnyImportTask>,
|
|
||||||
|
|
||||||
dstdir: Utf8PathBuf,
|
|
||||||
tenant_id: TenantId,
|
|
||||||
timeline_id: TimelineId,
|
|
||||||
pgdata_lsn: Lsn,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ChunkProcessingJob {
|
|
||||||
fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &PgImportEnv) -> Self {
|
|
||||||
assert!(env.pgdata_lsn.is_valid());
|
|
||||||
Self {
|
|
||||||
range,
|
|
||||||
tasks,
|
|
||||||
dstdir: env.conf.workdir.clone(),
|
|
||||||
tenant_id: env.tsi.tenant_id,
|
|
||||||
timeline_id: env.tli,
|
|
||||||
pgdata_lsn: env.pgdata_lsn,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn run(self) -> anyhow::Result<PersistentLayerDesc> {
|
|
||||||
let ctx: RequestContext = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
|
||||||
let config = toml_edit::Document::new();
|
|
||||||
let conf: &'static PageServerConf = Box::leak(Box::new(PageServerConf::parse_and_validate(
|
|
||||||
NodeId(42),
|
|
||||||
&config,
|
|
||||||
&self.dstdir
|
|
||||||
)?));
|
|
||||||
let tsi = TenantShardId {
|
|
||||||
tenant_id: self.tenant_id,
|
|
||||||
shard_number: ShardNumber(0),
|
|
||||||
shard_count: ShardCount(0),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut layer = ImageLayerWriter::new(
|
|
||||||
&conf,
|
|
||||||
self.timeline_id,
|
|
||||||
tsi,
|
|
||||||
&self.range,
|
|
||||||
self.pgdata_lsn,
|
|
||||||
&ctx,
|
|
||||||
).await?;
|
|
||||||
|
|
||||||
for task in self.tasks {
|
|
||||||
task.doit(&mut layer, &ctx).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
let layerdesc = layer.finish_raw(&ctx).await?;
|
|
||||||
Ok(layerdesc)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -12,14 +12,15 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
|
|||||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||||
use crate::walrecord::NeonWalRecord;
|
use crate::walrecord::NeonWalRecord;
|
||||||
use crate::{aux_file, repository::*};
|
use crate::{aux_file, repository::*};
|
||||||
use anyhow::{bail, ensure, Context};
|
use anyhow::{ensure, Context};
|
||||||
use bytes::{Buf, Bytes, BytesMut};
|
use bytes::{Buf, Bytes, BytesMut};
|
||||||
use enum_map::Enum;
|
use enum_map::Enum;
|
||||||
|
use itertools::Itertools;
|
||||||
use pageserver_api::key::{
|
use pageserver_api::key::{
|
||||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||||
};
|
};
|
||||||
use pageserver_api::keyspace::SparseKeySpace;
|
use pageserver_api::keyspace::SparseKeySpace;
|
||||||
use pageserver_api::models::AuxFilePolicy;
|
use pageserver_api::models::AuxFilePolicy;
|
||||||
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::{debug, info, trace, warn};
|
use tracing::{debug, info, trace, warn};
|
||||||
use utils::bin_ser::DeserializeError;
|
use utils::bin_ser::DeserializeError;
|
||||||
use utils::pausable_failpoint;
|
use utils::pausable_failpoint;
|
||||||
|
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||||
|
|
||||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||||
@@ -172,7 +174,6 @@ impl Timeline {
|
|||||||
pending_deletions: Vec::new(),
|
pending_deletions: Vec::new(),
|
||||||
pending_nblocks: 0,
|
pending_nblocks: 0,
|
||||||
pending_directory_entries: Vec::new(),
|
pending_directory_entries: Vec::new(),
|
||||||
pending_bytes: 0,
|
|
||||||
lsn,
|
lsn,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -286,7 +287,10 @@ impl Timeline {
|
|||||||
// then check if the database was already initialized.
|
// then check if the database was already initialized.
|
||||||
// get_rel_exists can be called before dbdir is created.
|
// get_rel_exists can be called before dbdir is created.
|
||||||
let buf = version.get(self, DBDIR_KEY, ctx).await?;
|
let buf = version.get(self, DBDIR_KEY, ctx).await?;
|
||||||
let dbdirs = DbDirectory::des(&buf)?.dbdirs;
|
let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
|
||||||
|
Ok(dir) => Ok(dir.dbdirs),
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}?;
|
||||||
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
|
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
@@ -294,8 +298,13 @@ impl Timeline {
|
|||||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||||
let buf = version.get(self, key, ctx).await?;
|
let buf = version.get(self, key, ctx).await?;
|
||||||
|
|
||||||
let dir = RelDirectory::des(&buf)?;
|
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||||
Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
|
Ok(dir) => {
|
||||||
|
let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
|
||||||
|
Ok(exists)
|
||||||
|
}
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get a list of all existing relations in given tablespace and database.
|
/// Get a list of all existing relations in given tablespace and database.
|
||||||
@@ -314,16 +323,20 @@ impl Timeline {
|
|||||||
let key = rel_dir_to_key(spcnode, dbnode);
|
let key = rel_dir_to_key(spcnode, dbnode);
|
||||||
let buf = version.get(self, key, ctx).await?;
|
let buf = version.get(self, key, ctx).await?;
|
||||||
|
|
||||||
let dir = RelDirectory::des(&buf)?;
|
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||||
let rels: HashSet<RelTag> =
|
Ok(dir) => {
|
||||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
let rels: HashSet<RelTag> =
|
||||||
spcnode,
|
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||||
dbnode,
|
spcnode,
|
||||||
relnode: *relnode,
|
dbnode,
|
||||||
forknum: *forknum,
|
relnode: *relnode,
|
||||||
}));
|
forknum: *forknum,
|
||||||
|
}));
|
||||||
|
|
||||||
Ok(rels)
|
Ok(rels)
|
||||||
|
}
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the whole SLRU segment
|
/// Get the whole SLRU segment
|
||||||
@@ -385,8 +398,13 @@ impl Timeline {
|
|||||||
let key = slru_dir_to_key(kind);
|
let key = slru_dir_to_key(kind);
|
||||||
let buf = version.get(self, key, ctx).await?;
|
let buf = version.get(self, key, ctx).await?;
|
||||||
|
|
||||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||||
Ok(dir.segments.contains(&segno))
|
Ok(dir) => {
|
||||||
|
let exists = dir.segments.contains(&segno);
|
||||||
|
Ok(exists)
|
||||||
|
}
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Locate LSN, such that all transactions that committed before
|
/// Locate LSN, such that all transactions that committed before
|
||||||
@@ -602,7 +620,10 @@ impl Timeline {
|
|||||||
let key = slru_dir_to_key(kind);
|
let key = slru_dir_to_key(kind);
|
||||||
|
|
||||||
let buf = version.get(self, key, ctx).await?;
|
let buf = version.get(self, key, ctx).await?;
|
||||||
Ok(SlruSegmentDirectory::des(&buf)?.segments)
|
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||||
|
Ok(dir) => Ok(dir.segments),
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn get_relmap_file(
|
pub(crate) async fn get_relmap_file(
|
||||||
@@ -626,7 +647,10 @@ impl Timeline {
|
|||||||
// fetch directory entry
|
// fetch directory entry
|
||||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||||
|
|
||||||
Ok(DbDirectory::des(&buf)?.dbdirs)
|
match DbDirectory::des(&buf).context("deserialization failure") {
|
||||||
|
Ok(dir) => Ok(dir.dbdirs),
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn get_twophase_file(
|
pub(crate) async fn get_twophase_file(
|
||||||
@@ -648,7 +672,10 @@ impl Timeline {
|
|||||||
// fetch directory entry
|
// fetch directory entry
|
||||||
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
|
||||||
|
|
||||||
Ok(TwoPhaseDirectory::des(&buf)?.xids)
|
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
|
||||||
|
Ok(dir) => Ok(dir.xids),
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn get_control_file(
|
pub(crate) async fn get_control_file(
|
||||||
@@ -673,7 +700,10 @@ impl Timeline {
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||||
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
||||||
Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files),
|
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
|
||||||
|
Ok(dir) => Ok(dir.files),
|
||||||
|
Err(e) => Err(PageReconstructError::from(e)),
|
||||||
|
},
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// This is expected: historical databases do not have the key.
|
// This is expected: historical databases do not have the key.
|
||||||
debug!("Failed to get info about AUX files: {}", e);
|
debug!("Failed to get info about AUX files: {}", e);
|
||||||
@@ -689,14 +719,13 @@ impl Timeline {
|
|||||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||||
let kv = self
|
let kv = self
|
||||||
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.context("scan")?;
|
||||||
let mut result = HashMap::new();
|
let mut result = HashMap::new();
|
||||||
let mut sz = 0;
|
let mut sz = 0;
|
||||||
for (_, v) in kv {
|
for (_, v) in kv {
|
||||||
let v = v?;
|
let v = v.context("get value")?;
|
||||||
let v = aux_file::decode_file_value_bytes(&v)
|
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
|
||||||
.context("value decode")
|
|
||||||
.map_err(PageReconstructError::Other)?;
|
|
||||||
for (fname, content) in v {
|
for (fname, content) in v {
|
||||||
sz += fname.len();
|
sz += fname.len();
|
||||||
sz += content.len();
|
sz += content.len();
|
||||||
@@ -726,17 +755,7 @@ impl Timeline {
|
|||||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||||
let current_policy = self.last_aux_file_policy.load();
|
let current_policy = self.last_aux_file_policy.load();
|
||||||
match current_policy {
|
match current_policy {
|
||||||
Some(AuxFilePolicy::V1) => {
|
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||||
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
|
|
||||||
self.list_aux_files_v1(lsn, ctx).await
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
|
||||||
if !res.is_empty() {
|
|
||||||
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
|
|
||||||
}
|
|
||||||
Ok(res)
|
|
||||||
}
|
|
||||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||||
Some(AuxFilePolicy::CrossValidation) => {
|
Some(AuxFilePolicy::CrossValidation) => {
|
||||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||||
@@ -774,10 +793,11 @@ impl Timeline {
|
|||||||
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
|
) -> Result<HashMap<RepOriginId, Lsn>, PageReconstructError> {
|
||||||
let kv = self
|
let kv = self
|
||||||
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
|
.scan(KeySpace::single(repl_origin_key_range()), lsn, ctx)
|
||||||
.await?;
|
.await
|
||||||
|
.context("scan")?;
|
||||||
let mut result = HashMap::new();
|
let mut result = HashMap::new();
|
||||||
for (k, v) in kv {
|
for (k, v) in kv {
|
||||||
let v = v?;
|
let v = v.context("get value")?;
|
||||||
let origin_id = k.field6 as RepOriginId;
|
let origin_id = k.field6 as RepOriginId;
|
||||||
let origin_lsn = Lsn::des(&v).unwrap();
|
let origin_lsn = Lsn::des(&v).unwrap();
|
||||||
if origin_lsn != Lsn::INVALID {
|
if origin_lsn != Lsn::INVALID {
|
||||||
@@ -1031,33 +1051,21 @@ pub struct DatadirModification<'a> {
|
|||||||
// The put-functions add the modifications here, and they are flushed to the
|
// The put-functions add the modifications here, and they are flushed to the
|
||||||
// underlying key-value store by the 'finish' function.
|
// underlying key-value store by the 'finish' function.
|
||||||
pending_lsns: Vec<Lsn>,
|
pending_lsns: Vec<Lsn>,
|
||||||
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
|
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||||
pending_nblocks: i64,
|
pending_nblocks: i64,
|
||||||
|
|
||||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||||
/// if it was updated in this modification.
|
/// if it was updated in this modification.
|
||||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||||
|
|
||||||
/// An **approximation** of how large our EphemeralFile write will be when committed.
|
|
||||||
pending_bytes: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DatadirModification<'a> {
|
impl<'a> DatadirModification<'a> {
|
||||||
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
|
|
||||||
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
|
|
||||||
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
|
|
||||||
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
|
|
||||||
|
|
||||||
/// Get the current lsn
|
/// Get the current lsn
|
||||||
pub(crate) fn get_lsn(&self) -> Lsn {
|
pub(crate) fn get_lsn(&self) -> Lsn {
|
||||||
self.lsn
|
self.lsn
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn approx_pending_bytes(&self) -> usize {
|
|
||||||
self.pending_bytes
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the current lsn
|
/// Set the current lsn
|
||||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||||
ensure!(
|
ensure!(
|
||||||
@@ -1597,7 +1605,6 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if aux_files_key_v1.is_empty() {
|
if aux_files_key_v1.is_empty() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
warn!("this timeline is using deprecated aux file policy V1");
|
|
||||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||||
Some(AuxFilePolicy::V1)
|
Some(AuxFilePolicy::V1)
|
||||||
}
|
}
|
||||||
@@ -1726,17 +1733,12 @@ impl<'a> DatadirModification<'a> {
|
|||||||
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
||||||
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
||||||
Err(
|
Err(
|
||||||
e @ (PageReconstructError::Other(_)
|
PageReconstructError::Other(_)
|
||||||
| PageReconstructError::WalRedo(_)
|
| PageReconstructError::WalRedo(_)
|
||||||
| PageReconstructError::MissingKey(_)),
|
| PageReconstructError::MissingKey { .. },
|
||||||
) => {
|
) => {
|
||||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||||
|
|
||||||
if !matches!(e, PageReconstructError::MissingKey(_)) {
|
|
||||||
let e = utils::error::report_compact_sources(&e);
|
|
||||||
tracing::warn!("treating error as if it was a missing key: {}", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut dir = AuxFilesDirectory {
|
let mut dir = AuxFilesDirectory {
|
||||||
files: HashMap::new(),
|
files: HashMap::new(),
|
||||||
};
|
};
|
||||||
@@ -1791,30 +1793,21 @@ impl<'a> DatadirModification<'a> {
|
|||||||
// Flush relation and SLRU data blocks, keep metadata.
|
// Flush relation and SLRU data blocks, keep metadata.
|
||||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||||
for (key, values) in self.pending_updates.drain() {
|
for (key, values) in self.pending_updates.drain() {
|
||||||
if !key.is_valid_key_on_write_path() {
|
for (lsn, value) in values {
|
||||||
bail!(
|
|
||||||
"the request contains data not supported by pageserver at TimelineWriter::put: {}", key
|
|
||||||
);
|
|
||||||
}
|
|
||||||
let mut write_batch = Vec::new();
|
|
||||||
for (lsn, value_ser_size, value) in values {
|
|
||||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||||
// This bails out on first error without modifying pending_updates.
|
// This bails out on first error without modifying pending_updates.
|
||||||
// That's Ok, cf this function's doc comment.
|
// That's Ok, cf this function's doc comment.
|
||||||
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
|
writer.put(key, lsn, &value, ctx).await?;
|
||||||
} else {
|
} else {
|
||||||
retained_pending_updates.entry(key).or_default().push((
|
retained_pending_updates
|
||||||
lsn,
|
.entry(key)
|
||||||
value_ser_size,
|
.or_default()
|
||||||
value,
|
.push((lsn, value));
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
writer.put_batch(write_batch, ctx).await?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
self.pending_updates = retained_pending_updates;
|
self.pending_updates = retained_pending_updates;
|
||||||
self.pending_bytes = 0;
|
|
||||||
|
|
||||||
if pending_nblocks != 0 {
|
if pending_nblocks != 0 {
|
||||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||||
@@ -1840,23 +1833,17 @@ impl<'a> DatadirModification<'a> {
|
|||||||
self.pending_nblocks = 0;
|
self.pending_nblocks = 0;
|
||||||
|
|
||||||
if !self.pending_updates.is_empty() {
|
if !self.pending_updates.is_empty() {
|
||||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
// so we do that first.
|
||||||
// this to do efficient updates to its index.
|
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||||
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
|
self.pending_updates
|
||||||
.pending_updates
|
.drain()
|
||||||
.drain()
|
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||||
.flat_map(|(key, values)| {
|
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||||
values.into_iter().map(move |(lsn, val_ser_size, value)| {
|
VecMapOrdering::GreaterOrEqual,
|
||||||
if !key.is_valid_key_on_write_path() {
|
);
|
||||||
bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
|
|
||||||
}
|
|
||||||
Ok((key.to_compact(), lsn, val_ser_size, value))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
||||||
|
|
||||||
writer.put_batch(batch, ctx).await?;
|
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if !self.pending_deletions.is_empty() {
|
if !self.pending_deletions.is_empty() {
|
||||||
@@ -1881,8 +1868,6 @@ impl<'a> DatadirModification<'a> {
|
|||||||
writer.update_directory_entries_count(kind, count as u64);
|
writer.update_directory_entries_count(kind, count as u64);
|
||||||
}
|
}
|
||||||
|
|
||||||
self.pending_bytes = 0;
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1899,7 +1884,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
// Note: we don't check pending_deletions. It is an error to request a
|
// Note: we don't check pending_deletions. It is an error to request a
|
||||||
// value that has been removed, deletion only avoids leaking storage.
|
// value that has been removed, deletion only avoids leaking storage.
|
||||||
if let Some(values) = self.pending_updates.get(&key) {
|
if let Some(values) = self.pending_updates.get(&key) {
|
||||||
if let Some((_, _, value)) = values.last() {
|
if let Some((_, value)) = values.last() {
|
||||||
return if let Value::Image(img) = value {
|
return if let Value::Image(img) = value {
|
||||||
Ok(img.clone())
|
Ok(img.clone())
|
||||||
} else {
|
} else {
|
||||||
@@ -1908,7 +1893,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
// work directly with Images, and we never need to read actual
|
// work directly with Images, and we never need to read actual
|
||||||
// data pages. We could handle this if we had to, by calling
|
// data pages. We could handle this if we had to, by calling
|
||||||
// the walredo manager, but let's keep it simple for now.
|
// the walredo manager, but let's keep it simple for now.
|
||||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||||
"unexpected pending WAL record"
|
"unexpected pending WAL record"
|
||||||
)))
|
)))
|
||||||
};
|
};
|
||||||
@@ -1927,17 +1912,13 @@ impl<'a> DatadirModification<'a> {
|
|||||||
fn put(&mut self, key: Key, val: Value) {
|
fn put(&mut self, key: Key, val: Value) {
|
||||||
let values = self.pending_updates.entry(key).or_default();
|
let values = self.pending_updates.entry(key).or_default();
|
||||||
// Replace the previous value if it exists at the same lsn
|
// Replace the previous value if it exists at the same lsn
|
||||||
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||||
if *last_lsn == self.lsn {
|
if *last_lsn == self.lsn {
|
||||||
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
|
||||||
*last_value = val;
|
*last_value = val;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
values.push((self.lsn, val));
|
||||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
|
||||||
self.pending_bytes += val_serialized_size;
|
|
||||||
values.push((self.lsn, val_serialized_size, val));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn delete(&mut self, key_range: Range<Key>) {
|
fn delete(&mut self, key_range: Range<Key>) {
|
||||||
@@ -1982,23 +1963,23 @@ impl<'a> Version<'a> {
|
|||||||
//--- Metadata structs stored in key-value pairs in the repository.
|
//--- Metadata structs stored in key-value pairs in the repository.
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub struct DbDirectory {
|
struct DbDirectory {
|
||||||
// (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
|
// (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
|
||||||
pub dbdirs: HashMap<(Oid, Oid), bool>,
|
dbdirs: HashMap<(Oid, Oid), bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
pub(crate) struct TwoPhaseDirectory {
|
struct TwoPhaseDirectory {
|
||||||
pub(crate) xids: HashSet<TransactionId>,
|
xids: HashSet<TransactionId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||||
pub struct RelDirectory {
|
struct RelDirectory {
|
||||||
// Set of relations that exist. (relfilenode, forknum)
|
// Set of relations that exist. (relfilenode, forknum)
|
||||||
//
|
//
|
||||||
// TODO: Store it as a btree or radix tree or something else that spans multiple
|
// TODO: Store it as a btree or radix tree or something else that spans multiple
|
||||||
// key-value pairs, if you have a lot of relations
|
// key-value pairs, if you have a lot of relations
|
||||||
pub rels: HashSet<(Oid, u8)>,
|
rels: HashSet<(Oid, u8)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
|
#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
|
||||||
@@ -2022,9 +2003,9 @@ struct RelSizeEntry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||||
pub(crate) struct SlruSegmentDirectory {
|
struct SlruSegmentDirectory {
|
||||||
// Set of SLRU segments that exist.
|
// Set of SLRU segments that exist.
|
||||||
pub(crate) segments: HashSet<u32>,
|
segments: HashSet<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
||||||
@@ -2067,7 +2048,7 @@ mod tests {
|
|||||||
|
|
||||||
let (tenant, ctx) = harness.load().await;
|
let (tenant, ctx) = harness.load().await;
|
||||||
let tline = tenant
|
let tline = tenant
|
||||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||||
.await?;
|
.await?;
|
||||||
let tline = tline.raw_timeline().unwrap();
|
let tline = tline.raw_timeline().unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -146,12 +146,6 @@ impl FromStr for TokioRuntimeMode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
|
|
||||||
env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
|
|
||||||
// the default 2MiB are insufficent, especially in debug mode
|
|
||||||
.unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
|
|
||||||
});
|
|
||||||
|
|
||||||
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
||||||
let thread_name = "pageserver-tokio";
|
let thread_name = "pageserver-tokio";
|
||||||
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
|
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
|
||||||
@@ -170,7 +164,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
|||||||
tokio::runtime::Builder::new_current_thread()
|
tokio::runtime::Builder::new_current_thread()
|
||||||
.thread_name(thread_name)
|
.thread_name(thread_name)
|
||||||
.enable_all()
|
.enable_all()
|
||||||
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
|
||||||
.build()
|
.build()
|
||||||
.expect("failed to create one single runtime")
|
.expect("failed to create one single runtime")
|
||||||
}
|
}
|
||||||
@@ -180,7 +173,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
|||||||
.thread_name(thread_name)
|
.thread_name(thread_name)
|
||||||
.enable_all()
|
.enable_all()
|
||||||
.worker_threads(num_workers.get())
|
.worker_threads(num_workers.get())
|
||||||
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
|
||||||
.build()
|
.build()
|
||||||
.expect("failed to create one multi-threaded runtime")
|
.expect("failed to create one multi-threaded runtime")
|
||||||
}
|
}
|
||||||
@@ -207,7 +199,6 @@ macro_rules! pageserver_runtime {
|
|||||||
.thread_name($name)
|
.thread_name($name)
|
||||||
.worker_threads(TOKIO_WORKER_THREADS.get())
|
.worker_threads(TOKIO_WORKER_THREADS.get())
|
||||||
.enable_all()
|
.enable_all()
|
||||||
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
|
||||||
.build()
|
.build()
|
||||||
.expect(std::concat!("Failed to create runtime ", $name))
|
.expect(std::concat!("Failed to create runtime ", $name))
|
||||||
});
|
});
|
||||||
@@ -402,7 +393,7 @@ struct PageServerTask {
|
|||||||
|
|
||||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: Option<TenantShardId>,
|
||||||
timeline_id: Option<TimelineId>,
|
timeline_id: Option<TimelineId>,
|
||||||
|
|
||||||
mutable: Mutex<MutableTaskState>,
|
mutable: Mutex<MutableTaskState>,
|
||||||
@@ -414,7 +405,7 @@ struct PageServerTask {
|
|||||||
pub fn spawn<F>(
|
pub fn spawn<F>(
|
||||||
runtime: &tokio::runtime::Handle,
|
runtime: &tokio::runtime::Handle,
|
||||||
kind: TaskKind,
|
kind: TaskKind,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: Option<TenantShardId>,
|
||||||
timeline_id: Option<TimelineId>,
|
timeline_id: Option<TimelineId>,
|
||||||
name: &str,
|
name: &str,
|
||||||
future: F,
|
future: F,
|
||||||
@@ -559,7 +550,7 @@ pub async fn shutdown_tasks(
|
|||||||
let tasks = TASKS.lock().unwrap();
|
let tasks = TASKS.lock().unwrap();
|
||||||
for task in tasks.values() {
|
for task in tasks.values() {
|
||||||
if (kind.is_none() || Some(task.kind) == kind)
|
if (kind.is_none() || Some(task.kind) == kind)
|
||||||
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
|
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
|
||||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||||
{
|
{
|
||||||
task.cancel.cancel();
|
task.cancel.cancel();
|
||||||
@@ -582,8 +573,13 @@ pub async fn shutdown_tasks(
|
|||||||
};
|
};
|
||||||
if let Some(mut join_handle) = join_handle {
|
if let Some(mut join_handle) = join_handle {
|
||||||
if log_all {
|
if log_all {
|
||||||
// warn to catch these in tests; there shouldn't be any
|
if tenant_shard_id.is_none() {
|
||||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
// there are quite few of these
|
||||||
|
info!(name = task.name, kind = ?task_kind, "stopping global task");
|
||||||
|
} else {
|
||||||
|
// warn to catch these in tests; there shouldn't be any
|
||||||
|
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -41,7 +41,6 @@ use tokio::sync::watch;
|
|||||||
use tokio::task::JoinSet;
|
use tokio::task::JoinSet;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use upload_queue::NotInitialized;
|
|
||||||
use utils::backoff;
|
use utils::backoff;
|
||||||
use utils::circuit_breaker::CircuitBreaker;
|
use utils::circuit_breaker::CircuitBreaker;
|
||||||
use utils::completion;
|
use utils::completion;
|
||||||
@@ -302,11 +301,7 @@ pub struct Tenant {
|
|||||||
pub(crate) timeline_get_throttle:
|
pub(crate) timeline_get_throttle:
|
||||||
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
|
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
|
||||||
|
|
||||||
/// An ongoing timeline detach concurrency limiter.
|
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
|
||||||
///
|
|
||||||
/// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense
|
|
||||||
/// to have two running at the same time. A different one can be started if an earlier one
|
|
||||||
/// has failed for whatever reason.
|
|
||||||
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
|
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
|
||||||
|
|
||||||
/// `index_part.json` based gc blocking reason tracking.
|
/// `index_part.json` based gc blocking reason tracking.
|
||||||
@@ -501,42 +496,6 @@ impl Debug for DeleteTimelineError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(thiserror::Error)]
|
|
||||||
pub enum TimelineArchivalError {
|
|
||||||
#[error("NotFound")]
|
|
||||||
NotFound,
|
|
||||||
|
|
||||||
#[error("Timeout")]
|
|
||||||
Timeout,
|
|
||||||
|
|
||||||
#[error("ancestor is archived: {}", .0)]
|
|
||||||
HasArchivedParent(TimelineId),
|
|
||||||
|
|
||||||
#[error("HasUnarchivedChildren")]
|
|
||||||
HasUnarchivedChildren(Vec<TimelineId>),
|
|
||||||
|
|
||||||
#[error("Timeline archival is already in progress")]
|
|
||||||
AlreadyInProgress,
|
|
||||||
|
|
||||||
#[error(transparent)]
|
|
||||||
Other(#[from] anyhow::Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Debug for TimelineArchivalError {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
match self {
|
|
||||||
Self::NotFound => write!(f, "NotFound"),
|
|
||||||
Self::Timeout => write!(f, "Timeout"),
|
|
||||||
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
|
|
||||||
Self::HasUnarchivedChildren(c) => {
|
|
||||||
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
|
|
||||||
}
|
|
||||||
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
|
|
||||||
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub enum SetStoppingError {
|
pub enum SetStoppingError {
|
||||||
AlreadyStopping(completion::Barrier),
|
AlreadyStopping(completion::Barrier),
|
||||||
Broken,
|
Broken,
|
||||||
@@ -642,15 +601,6 @@ impl From<PageReconstructError> for GcError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<NotInitialized> for GcError {
|
|
||||||
fn from(value: NotInitialized) -> Self {
|
|
||||||
match value {
|
|
||||||
NotInitialized::Uninitialized => GcError::Remote(value.into()),
|
|
||||||
NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<timeline::layer_manager::Shutdown> for GcError {
|
impl From<timeline::layer_manager::Shutdown> for GcError {
|
||||||
fn from(_: timeline::layer_manager::Shutdown) -> Self {
|
fn from(_: timeline::layer_manager::Shutdown) -> Self {
|
||||||
GcError::TimelineCancelled
|
GcError::TimelineCancelled
|
||||||
@@ -834,7 +784,7 @@ impl Tenant {
|
|||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
&tokio::runtime::Handle::current(),
|
&tokio::runtime::Handle::current(),
|
||||||
TaskKind::Attach,
|
TaskKind::Attach,
|
||||||
tenant_shard_id,
|
Some(tenant_shard_id),
|
||||||
None,
|
None,
|
||||||
"attach tenant",
|
"attach tenant",
|
||||||
async move {
|
async move {
|
||||||
@@ -873,20 +823,14 @@ impl Tenant {
|
|||||||
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
|
||||||
// if it errors, we will call make_broken when tenant is already in Stopping.
|
// if it errors, we will call make_broken when tenant is already in Stopping.
|
||||||
assert!(
|
assert!(
|
||||||
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
|
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
|
||||||
"the attach task owns the tenant state until activation is complete"
|
"the attach task owns the tenant state until activation is complete"
|
||||||
);
|
);
|
||||||
|
|
||||||
*state = TenantState::broken_from_reason(err.to_string());
|
*state = TenantState::broken_from_reason(err.to_string());
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: should also be rejecting tenant conf changes that violate this check.
|
|
||||||
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
|
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut init_order = init_order;
|
let mut init_order = init_order;
|
||||||
// take the completion because initial tenant loading will complete when all of
|
// take the completion because initial tenant loading will complete when all of
|
||||||
// these tasks complete.
|
// these tasks complete.
|
||||||
@@ -1368,59 +1312,24 @@ impl Tenant {
|
|||||||
&self,
|
&self,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
state: TimelineArchivalState,
|
state: TimelineArchivalState,
|
||||||
) -> Result<(), TimelineArchivalError> {
|
) -> anyhow::Result<()> {
|
||||||
info!("setting timeline archival config");
|
let timeline = self
|
||||||
let timeline = {
|
.get_timeline(timeline_id, false)
|
||||||
let timelines = self.timelines.lock().unwrap();
|
.context("Cannot apply timeline archival config to inexistent timeline")?;
|
||||||
|
|
||||||
let Some(timeline) = timelines.get(&timeline_id) else {
|
|
||||||
return Err(TimelineArchivalError::NotFound);
|
|
||||||
};
|
|
||||||
|
|
||||||
if state == TimelineArchivalState::Unarchived {
|
|
||||||
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
|
|
||||||
if ancestor_timeline.is_archived() == Some(true) {
|
|
||||||
return Err(TimelineArchivalError::HasArchivedParent(
|
|
||||||
ancestor_timeline.timeline_id,
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure that there are no non-archived child timelines
|
|
||||||
let children: Vec<TimelineId> = timelines
|
|
||||||
.iter()
|
|
||||||
.filter_map(|(id, entry)| {
|
|
||||||
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
if entry.is_archived() == Some(true) {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
Some(*id)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if !children.is_empty() && state == TimelineArchivalState::Archived {
|
|
||||||
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
|
|
||||||
}
|
|
||||||
Arc::clone(timeline)
|
|
||||||
};
|
|
||||||
|
|
||||||
let upload_needed = timeline
|
let upload_needed = timeline
|
||||||
.remote_client
|
.remote_client
|
||||||
.schedule_index_upload_for_timeline_archival_state(state)?;
|
.schedule_index_upload_for_timeline_archival_state(state)?;
|
||||||
|
|
||||||
if upload_needed {
|
if upload_needed {
|
||||||
info!("Uploading new state");
|
|
||||||
const MAX_WAIT: Duration = Duration::from_secs(10);
|
const MAX_WAIT: Duration = Duration::from_secs(10);
|
||||||
let Ok(v) =
|
let Ok(v) =
|
||||||
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
||||||
else {
|
else {
|
||||||
tracing::warn!("reached timeout for waiting on upload queue");
|
tracing::warn!("reached timeout for waiting on upload queue");
|
||||||
return Err(TimelineArchivalError::Timeout);
|
bail!("reached timeout for upload queue flush");
|
||||||
};
|
};
|
||||||
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
|
v?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -3813,27 +3722,6 @@ impl Tenant {
|
|||||||
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||||
self.tenant_conf.load().tenant_conf.clone()
|
self.tenant_conf.load().tenant_conf.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// How much local storage would this tenant like to have? It can cope with
|
|
||||||
/// less than this (via eviction and on-demand downloads), but this function enables
|
|
||||||
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
|
|
||||||
/// by keeping important things on local disk.
|
|
||||||
///
|
|
||||||
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
|
|
||||||
/// than they report here, due to layer eviction. Tenants with many active branches may
|
|
||||||
/// actually use more than they report here.
|
|
||||||
pub(crate) fn local_storage_wanted(&self) -> u64 {
|
|
||||||
let timelines = self.timelines.lock().unwrap();
|
|
||||||
|
|
||||||
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
|
|
||||||
// reflects the observation that on tenants with multiple large branches, typically only one
|
|
||||||
// of them is used actively enough to occupy space on disk.
|
|
||||||
timelines
|
|
||||||
.values()
|
|
||||||
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
|
||||||
.max()
|
|
||||||
.unwrap_or(0)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||||
@@ -4576,13 +4464,10 @@ mod tests {
|
|||||||
|
|
||||||
// This needs to traverse to the parent, and fails.
|
// This needs to traverse to the parent, and fails.
|
||||||
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
|
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
|
||||||
assert!(
|
assert!(err.to_string().starts_with(&format!(
|
||||||
err.to_string().starts_with(&format!(
|
"Bad state on timeline {}: Broken",
|
||||||
"bad state on timeline {}: Broken",
|
tline.timeline_id
|
||||||
tline.timeline_id
|
)));
|
||||||
)),
|
|
||||||
"{err}"
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -6017,10 +5902,10 @@ mod tests {
|
|||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// the default aux file policy to switch is v2 if not set by the admins
|
// the default aux file policy to switch is v1 if not set by the admins
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
harness.tenant_conf.switch_aux_file_policy,
|
harness.tenant_conf.switch_aux_file_policy,
|
||||||
AuxFilePolicy::default_tenant_config()
|
AuxFilePolicy::V1
|
||||||
);
|
);
|
||||||
let (tenant, ctx) = harness.load().await;
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
|
||||||
@@ -6064,8 +5949,8 @@ mod tests {
|
|||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
tline.last_aux_file_policy.load(),
|
tline.last_aux_file_policy.load(),
|
||||||
Some(AuxFilePolicy::V2),
|
Some(AuxFilePolicy::V1),
|
||||||
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
|
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
|
||||||
);
|
);
|
||||||
|
|
||||||
// we can read everything from the storage
|
// we can read everything from the storage
|
||||||
@@ -6087,8 +5972,8 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
tline.last_aux_file_policy.load(),
|
tline.last_aux_file_policy.load(),
|
||||||
Some(AuxFilePolicy::V2),
|
Some(AuxFilePolicy::V1),
|
||||||
"keep v2 storage format when new files are written"
|
"keep v1 storage format when new files are written"
|
||||||
);
|
);
|
||||||
|
|
||||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||||
@@ -6104,7 +5989,7 @@ mod tests {
|
|||||||
|
|
||||||
// child copies the last flag even if that is not on remote storage yet
|
// child copies the last flag even if that is not on remote storage yet
|
||||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
|
||||||
|
|
||||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||||
@@ -7090,14 +6975,18 @@ mod tests {
|
|||||||
vec![
|
vec![
|
||||||
// Image layer at GC horizon
|
// Image layer at GC horizon
|
||||||
PersistentLayerKey {
|
PersistentLayerKey {
|
||||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
key_range: {
|
||||||
|
let mut key = Key::MAX;
|
||||||
|
key.field6 -= 1;
|
||||||
|
Key::MIN..key
|
||||||
|
},
|
||||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||||
is_delta: false
|
is_delta: false
|
||||||
},
|
},
|
||||||
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
|
// The delta layer that is cut in the middle
|
||||||
PersistentLayerKey {
|
PersistentLayerKey {
|
||||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
key_range: get_key(3)..get_key(4),
|
||||||
lsn_range: Lsn(0x30)..Lsn(0x48),
|
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||||
is_delta: true
|
is_delta: true
|
||||||
},
|
},
|
||||||
// The delta3 layer that should not be picked for the compaction
|
// The delta3 layer that should not be picked for the compaction
|
||||||
@@ -8077,214 +7966,6 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
|
||||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
|
||||||
{
|
|
||||||
let harness =
|
|
||||||
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
|
|
||||||
.await?;
|
|
||||||
let (tenant, ctx) = harness.load().await;
|
|
||||||
|
|
||||||
fn get_key(id: u32) -> Key {
|
|
||||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
|
||||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
|
||||||
key.field6 = id;
|
|
||||||
key
|
|
||||||
}
|
|
||||||
|
|
||||||
let img_layer = (0..10)
|
|
||||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
|
||||||
.collect_vec();
|
|
||||||
|
|
||||||
let delta1 = vec![
|
|
||||||
(
|
|
||||||
get_key(1),
|
|
||||||
Lsn(0x20),
|
|
||||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
get_key(1),
|
|
||||||
Lsn(0x28),
|
|
||||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
|
||||||
),
|
|
||||||
];
|
|
||||||
let delta2 = vec![
|
|
||||||
(
|
|
||||||
get_key(1),
|
|
||||||
Lsn(0x30),
|
|
||||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
get_key(1),
|
|
||||||
Lsn(0x38),
|
|
||||||
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
|
||||||
),
|
|
||||||
];
|
|
||||||
let delta3 = vec![
|
|
||||||
(
|
|
||||||
get_key(8),
|
|
||||||
Lsn(0x48),
|
|
||||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
get_key(9),
|
|
||||||
Lsn(0x48),
|
|
||||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
|
||||||
),
|
|
||||||
];
|
|
||||||
|
|
||||||
let tline = tenant
|
|
||||||
.create_test_timeline_with_layers(
|
|
||||||
TIMELINE_ID,
|
|
||||||
Lsn(0x10),
|
|
||||||
DEFAULT_PG_VERSION,
|
|
||||||
&ctx,
|
|
||||||
vec![
|
|
||||||
// delta1 and delta 2 only contain a single key but multiple updates
|
|
||||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
|
||||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
|
||||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
|
|
||||||
], // delta layers
|
|
||||||
vec![(Lsn(0x10), img_layer)], // image layers
|
|
||||||
Lsn(0x50),
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
{
|
|
||||||
// Update GC info
|
|
||||||
let mut guard = tline.gc_info.write().unwrap();
|
|
||||||
*guard = GcInfo {
|
|
||||||
retain_lsns: vec![
|
|
||||||
(Lsn(0x10), tline.timeline_id),
|
|
||||||
(Lsn(0x20), tline.timeline_id),
|
|
||||||
],
|
|
||||||
cutoffs: GcCutoffs {
|
|
||||||
time: Lsn(0x30),
|
|
||||||
space: Lsn(0x30),
|
|
||||||
},
|
|
||||||
leases: Default::default(),
|
|
||||||
within_ancestor_pitr: false,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let expected_result = [
|
|
||||||
Bytes::from_static(b"value 0@0x10"),
|
|
||||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
|
||||||
Bytes::from_static(b"value 2@0x10"),
|
|
||||||
Bytes::from_static(b"value 3@0x10"),
|
|
||||||
Bytes::from_static(b"value 4@0x10"),
|
|
||||||
Bytes::from_static(b"value 5@0x10"),
|
|
||||||
Bytes::from_static(b"value 6@0x10"),
|
|
||||||
Bytes::from_static(b"value 7@0x10"),
|
|
||||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
|
||||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
|
||||||
];
|
|
||||||
|
|
||||||
let expected_result_at_gc_horizon = [
|
|
||||||
Bytes::from_static(b"value 0@0x10"),
|
|
||||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
|
||||||
Bytes::from_static(b"value 2@0x10"),
|
|
||||||
Bytes::from_static(b"value 3@0x10"),
|
|
||||||
Bytes::from_static(b"value 4@0x10"),
|
|
||||||
Bytes::from_static(b"value 5@0x10"),
|
|
||||||
Bytes::from_static(b"value 6@0x10"),
|
|
||||||
Bytes::from_static(b"value 7@0x10"),
|
|
||||||
Bytes::from_static(b"value 8@0x10"),
|
|
||||||
Bytes::from_static(b"value 9@0x10"),
|
|
||||||
];
|
|
||||||
|
|
||||||
let expected_result_at_lsn_20 = [
|
|
||||||
Bytes::from_static(b"value 0@0x10"),
|
|
||||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
|
||||||
Bytes::from_static(b"value 2@0x10"),
|
|
||||||
Bytes::from_static(b"value 3@0x10"),
|
|
||||||
Bytes::from_static(b"value 4@0x10"),
|
|
||||||
Bytes::from_static(b"value 5@0x10"),
|
|
||||||
Bytes::from_static(b"value 6@0x10"),
|
|
||||||
Bytes::from_static(b"value 7@0x10"),
|
|
||||||
Bytes::from_static(b"value 8@0x10"),
|
|
||||||
Bytes::from_static(b"value 9@0x10"),
|
|
||||||
];
|
|
||||||
|
|
||||||
let expected_result_at_lsn_10 = [
|
|
||||||
Bytes::from_static(b"value 0@0x10"),
|
|
||||||
Bytes::from_static(b"value 1@0x10"),
|
|
||||||
Bytes::from_static(b"value 2@0x10"),
|
|
||||||
Bytes::from_static(b"value 3@0x10"),
|
|
||||||
Bytes::from_static(b"value 4@0x10"),
|
|
||||||
Bytes::from_static(b"value 5@0x10"),
|
|
||||||
Bytes::from_static(b"value 6@0x10"),
|
|
||||||
Bytes::from_static(b"value 7@0x10"),
|
|
||||||
Bytes::from_static(b"value 8@0x10"),
|
|
||||||
Bytes::from_static(b"value 9@0x10"),
|
|
||||||
];
|
|
||||||
|
|
||||||
let verify_result = || async {
|
|
||||||
let gc_horizon = {
|
|
||||||
let gc_info = tline.gc_info.read().unwrap();
|
|
||||||
gc_info.cutoffs.time
|
|
||||||
};
|
|
||||||
for idx in 0..10 {
|
|
||||||
assert_eq!(
|
|
||||||
tline
|
|
||||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap(),
|
|
||||||
&expected_result[idx]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
tline
|
|
||||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap(),
|
|
||||||
&expected_result_at_gc_horizon[idx]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
tline
|
|
||||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap(),
|
|
||||||
&expected_result_at_lsn_20[idx]
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
tline
|
|
||||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap(),
|
|
||||||
&expected_result_at_lsn_10[idx]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
verify_result().await;
|
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
let mut dryrun_flags = EnumSet::new();
|
|
||||||
dryrun_flags.insert(CompactFlags::DryRun);
|
|
||||||
|
|
||||||
tline
|
|
||||||
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
|
|
||||||
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
|
||||||
verify_result().await;
|
|
||||||
|
|
||||||
tline
|
|
||||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
verify_result().await;
|
|
||||||
|
|
||||||
// compact again
|
|
||||||
tline
|
|
||||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
verify_result().await;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||||
|
|||||||
@@ -24,7 +24,6 @@ use tracing::warn;
|
|||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::page_cache::PAGE_SZ;
|
use crate::page_cache::PAGE_SZ;
|
||||||
use crate::tenant::block_io::BlockCursor;
|
use crate::tenant::block_io::BlockCursor;
|
||||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
|
||||||
use crate::virtual_file::VirtualFile;
|
use crate::virtual_file::VirtualFile;
|
||||||
use std::cmp::min;
|
use std::cmp::min;
|
||||||
use std::io::{Error, ErrorKind};
|
use std::io::{Error, ErrorKind};
|
||||||
@@ -148,7 +147,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
|||||||
|
|
||||||
/// The maximum size of blobs we support. The highest few bits
|
/// The maximum size of blobs we support. The highest few bits
|
||||||
/// are reserved for compression and other further uses.
|
/// are reserved for compression and other further uses.
|
||||||
pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
|
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
|
||||||
|
|
||||||
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||||
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||||
@@ -187,11 +186,11 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
/// You need to make sure that the internal buffer is empty, otherwise
|
/// You need to make sure that the internal buffer is empty, otherwise
|
||||||
/// data will be written in wrong order.
|
/// data will be written in wrong order.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
async fn write_all_unbuffered<Buf: IoBuf + Send>(
|
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||||
&mut self,
|
&mut self,
|
||||||
src_buf: FullSlice<Buf>,
|
src_buf: B,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> (FullSlice<Buf>, Result<(), Error>) {
|
) -> (B::Buf, Result<(), Error>) {
|
||||||
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
|
let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
|
||||||
let nbytes = match res {
|
let nbytes = match res {
|
||||||
Ok(nbytes) => nbytes,
|
Ok(nbytes) => nbytes,
|
||||||
@@ -205,9 +204,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||||
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
|
pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
|
||||||
let buf = std::mem::take(&mut self.buf);
|
let buf = std::mem::take(&mut self.buf);
|
||||||
let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await;
|
let (mut buf, res) = self.inner.write_all(buf, ctx).await;
|
||||||
res?;
|
res?;
|
||||||
let mut buf = slice.into_raw_slice().into_inner();
|
|
||||||
buf.clear();
|
buf.clear();
|
||||||
self.buf = buf;
|
self.buf = buf;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -224,30 +222,19 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Internal, possibly buffered, write function
|
/// Internal, possibly buffered, write function
|
||||||
async fn write_all<Buf: IoBuf + Send>(
|
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||||
&mut self,
|
&mut self,
|
||||||
src_buf: FullSlice<Buf>,
|
src_buf: B,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> (FullSlice<Buf>, Result<(), Error>) {
|
) -> (B::Buf, Result<(), Error>) {
|
||||||
let src_buf = src_buf.into_raw_slice();
|
|
||||||
let src_buf_bounds = src_buf.bounds();
|
|
||||||
let restore = move |src_buf_slice: Slice<_>| {
|
|
||||||
FullSlice::must_new(Slice::from_buf_bounds(
|
|
||||||
src_buf_slice.into_inner(),
|
|
||||||
src_buf_bounds,
|
|
||||||
))
|
|
||||||
};
|
|
||||||
|
|
||||||
if !BUFFERED {
|
if !BUFFERED {
|
||||||
assert!(self.buf.is_empty());
|
assert!(self.buf.is_empty());
|
||||||
return self
|
return self.write_all_unbuffered(src_buf, ctx).await;
|
||||||
.write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
|
|
||||||
.await;
|
|
||||||
}
|
}
|
||||||
let remaining = Self::CAPACITY - self.buf.len();
|
let remaining = Self::CAPACITY - self.buf.len();
|
||||||
let src_buf_len = src_buf.bytes_init();
|
let src_buf_len = src_buf.bytes_init();
|
||||||
if src_buf_len == 0 {
|
if src_buf_len == 0 {
|
||||||
return (restore(src_buf), Ok(()));
|
return (Slice::into_inner(src_buf.slice_full()), Ok(()));
|
||||||
}
|
}
|
||||||
let mut src_buf = src_buf.slice(0..src_buf_len);
|
let mut src_buf = src_buf.slice(0..src_buf_len);
|
||||||
// First try to copy as much as we can into the buffer
|
// First try to copy as much as we can into the buffer
|
||||||
@@ -258,7 +245,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
// Then, if the buffer is full, flush it out
|
// Then, if the buffer is full, flush it out
|
||||||
if self.buf.len() == Self::CAPACITY {
|
if self.buf.len() == Self::CAPACITY {
|
||||||
if let Err(e) = self.flush_buffer(ctx).await {
|
if let Err(e) = self.flush_buffer(ctx).await {
|
||||||
return (restore(src_buf), Err(e));
|
return (Slice::into_inner(src_buf), Err(e));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Finally, write the tail of src_buf:
|
// Finally, write the tail of src_buf:
|
||||||
@@ -271,29 +258,27 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
let copied = self.write_into_buffer(&src_buf);
|
let copied = self.write_into_buffer(&src_buf);
|
||||||
// We just verified above that src_buf fits into our internal buffer.
|
// We just verified above that src_buf fits into our internal buffer.
|
||||||
assert_eq!(copied, src_buf.len());
|
assert_eq!(copied, src_buf.len());
|
||||||
restore(src_buf)
|
Slice::into_inner(src_buf)
|
||||||
} else {
|
} else {
|
||||||
let (src_buf, res) = self
|
let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
|
||||||
.write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
|
|
||||||
.await;
|
|
||||||
if let Err(e) = res {
|
if let Err(e) = res {
|
||||||
return (src_buf, Err(e));
|
return (src_buf, Err(e));
|
||||||
}
|
}
|
||||||
src_buf
|
src_buf
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
restore(src_buf)
|
Slice::into_inner(src_buf)
|
||||||
};
|
};
|
||||||
(src_buf, Ok(()))
|
(src_buf, Ok(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write a blob of data. Returns the offset that it was written to,
|
/// Write a blob of data. Returns the offset that it was written to,
|
||||||
/// which can be used to retrieve the data later.
|
/// which can be used to retrieve the data later.
|
||||||
pub async fn write_blob<Buf: IoBuf + Send>(
|
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||||
&mut self,
|
&mut self,
|
||||||
srcbuf: FullSlice<Buf>,
|
srcbuf: B,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> (FullSlice<Buf>, Result<u64, Error>) {
|
) -> (B::Buf, Result<u64, Error>) {
|
||||||
let (buf, res) = self
|
let (buf, res) = self
|
||||||
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
|
||||||
.await;
|
.await;
|
||||||
@@ -302,40 +287,43 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
|
|
||||||
/// Write a blob of data. Returns the offset that it was written to,
|
/// Write a blob of data. Returns the offset that it was written to,
|
||||||
/// which can be used to retrieve the data later.
|
/// which can be used to retrieve the data later.
|
||||||
pub(crate) async fn write_blob_maybe_compressed<Buf: IoBuf + Send>(
|
pub async fn write_blob_maybe_compressed<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||||
&mut self,
|
&mut self,
|
||||||
srcbuf: FullSlice<Buf>,
|
srcbuf: B,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
algorithm: ImageCompressionAlgorithm,
|
algorithm: ImageCompressionAlgorithm,
|
||||||
) -> (FullSlice<Buf>, Result<(u64, CompressionInfo), Error>) {
|
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
|
||||||
let offset = self.offset;
|
let offset = self.offset;
|
||||||
let mut compression_info = CompressionInfo {
|
let mut compression_info = CompressionInfo {
|
||||||
written_compressed: false,
|
written_compressed: false,
|
||||||
compressed_size: None,
|
compressed_size: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let len = srcbuf.len();
|
let len = srcbuf.bytes_init();
|
||||||
|
|
||||||
let mut io_buf = self.io_buf.take().expect("we always put it back below");
|
let mut io_buf = self.io_buf.take().expect("we always put it back below");
|
||||||
io_buf.clear();
|
io_buf.clear();
|
||||||
let mut compressed_buf = None;
|
let mut compressed_buf = None;
|
||||||
let ((io_buf_slice, hdr_res), srcbuf) = async {
|
let ((io_buf, hdr_res), srcbuf) = async {
|
||||||
if len < 128 {
|
if len < 128 {
|
||||||
// Short blob. Write a 1-byte length header
|
// Short blob. Write a 1-byte length header
|
||||||
io_buf.put_u8(len as u8);
|
io_buf.put_u8(len as u8);
|
||||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
(
|
||||||
|
self.write_all(io_buf, ctx).await,
|
||||||
|
srcbuf.slice_full().into_inner(),
|
||||||
|
)
|
||||||
} else {
|
} else {
|
||||||
// Write a 4-byte length header
|
// Write a 4-byte length header
|
||||||
if len > MAX_SUPPORTED_BLOB_LEN {
|
if len > MAX_SUPPORTED_LEN {
|
||||||
return (
|
return (
|
||||||
(
|
(
|
||||||
io_buf.slice_len(),
|
io_buf,
|
||||||
Err(Error::new(
|
Err(Error::new(
|
||||||
ErrorKind::Other,
|
ErrorKind::Other,
|
||||||
format!("blob too large ({len} bytes)"),
|
format!("blob too large ({len} bytes)"),
|
||||||
)),
|
)),
|
||||||
),
|
),
|
||||||
srcbuf,
|
srcbuf.slice_full().into_inner(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
let (high_bit_mask, len_written, srcbuf) = match algorithm {
|
let (high_bit_mask, len_written, srcbuf) = match algorithm {
|
||||||
@@ -348,7 +336,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
} else {
|
} else {
|
||||||
async_compression::tokio::write::ZstdEncoder::new(Vec::new())
|
async_compression::tokio::write::ZstdEncoder::new(Vec::new())
|
||||||
};
|
};
|
||||||
encoder.write_all(&srcbuf[..]).await.unwrap();
|
let slice = srcbuf.slice_full();
|
||||||
|
encoder.write_all(&slice[..]).await.unwrap();
|
||||||
encoder.shutdown().await.unwrap();
|
encoder.shutdown().await.unwrap();
|
||||||
let compressed = encoder.into_inner();
|
let compressed = encoder.into_inner();
|
||||||
compression_info.compressed_size = Some(compressed.len());
|
compression_info.compressed_size = Some(compressed.len());
|
||||||
@@ -356,29 +345,31 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
compression_info.written_compressed = true;
|
compression_info.written_compressed = true;
|
||||||
let compressed_len = compressed.len();
|
let compressed_len = compressed.len();
|
||||||
compressed_buf = Some(compressed);
|
compressed_buf = Some(compressed);
|
||||||
(BYTE_ZSTD, compressed_len, srcbuf)
|
(BYTE_ZSTD, compressed_len, slice.into_inner())
|
||||||
} else {
|
} else {
|
||||||
(BYTE_UNCOMPRESSED, len, srcbuf)
|
(BYTE_UNCOMPRESSED, len, slice.into_inner())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ImageCompressionAlgorithm::Disabled => (BYTE_UNCOMPRESSED, len, srcbuf),
|
ImageCompressionAlgorithm::Disabled => {
|
||||||
|
(BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner())
|
||||||
|
}
|
||||||
};
|
};
|
||||||
let mut len_buf = (len_written as u32).to_be_bytes();
|
let mut len_buf = (len_written as u32).to_be_bytes();
|
||||||
assert_eq!(len_buf[0] & 0xf0, 0);
|
assert_eq!(len_buf[0] & 0xf0, 0);
|
||||||
len_buf[0] |= high_bit_mask;
|
len_buf[0] |= high_bit_mask;
|
||||||
io_buf.extend_from_slice(&len_buf[..]);
|
io_buf.extend_from_slice(&len_buf[..]);
|
||||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
(self.write_all(io_buf, ctx).await, srcbuf)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.await;
|
.await;
|
||||||
self.io_buf = Some(io_buf_slice.into_raw_slice().into_inner());
|
self.io_buf = Some(io_buf);
|
||||||
match hdr_res {
|
match hdr_res {
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
Err(e) => return (srcbuf, Err(e)),
|
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
|
||||||
}
|
}
|
||||||
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
|
let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf {
|
||||||
let (_buf, res) = self.write_all(compressed_buf.slice_len(), ctx).await;
|
let (_buf, res) = self.write_all(compressed_buf, ctx).await;
|
||||||
(srcbuf, res)
|
(Slice::into_inner(srcbuf.slice(..)), res)
|
||||||
} else {
|
} else {
|
||||||
self.write_all(srcbuf, ctx).await
|
self.write_all(srcbuf, ctx).await
|
||||||
};
|
};
|
||||||
@@ -441,21 +432,21 @@ pub(crate) mod tests {
|
|||||||
let (_, res) = if compression {
|
let (_, res) = if compression {
|
||||||
let res = wtr
|
let res = wtr
|
||||||
.write_blob_maybe_compressed(
|
.write_blob_maybe_compressed(
|
||||||
blob.clone().slice_len(),
|
blob.clone(),
|
||||||
ctx,
|
ctx,
|
||||||
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
ImageCompressionAlgorithm::Zstd { level: Some(1) },
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
(res.0, res.1.map(|(off, _)| off))
|
(res.0, res.1.map(|(off, _)| off))
|
||||||
} else {
|
} else {
|
||||||
wtr.write_blob(blob.clone().slice_len(), ctx).await
|
wtr.write_blob(blob.clone(), ctx).await
|
||||||
};
|
};
|
||||||
let offs = res?;
|
let offs = res?;
|
||||||
offsets.push(offs);
|
offsets.push(offs);
|
||||||
}
|
}
|
||||||
// Write out one page worth of zeros so that we can
|
// Write out one page worth of zeros so that we can
|
||||||
// read again with read_blk
|
// read again with read_blk
|
||||||
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await;
|
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
|
||||||
let offs = res?;
|
let offs = res?;
|
||||||
println!("Writing final blob at offs={offs}");
|
println!("Writing final blob at offs={offs}");
|
||||||
wtr.flush_buffer(ctx).await?;
|
wtr.flush_buffer(ctx).await?;
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
//! Low-level Block-oriented I/O functions
|
//! Low-level Block-oriented I/O functions
|
||||||
//!
|
//!
|
||||||
|
|
||||||
|
use super::ephemeral_file::EphemeralFile;
|
||||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||||
@@ -80,7 +81,9 @@ impl<'a> Deref for BlockLease<'a> {
|
|||||||
/// Unlike traits, we also support the read function to be async though.
|
/// Unlike traits, we also support the read function to be async though.
|
||||||
pub(crate) enum BlockReaderRef<'a> {
|
pub(crate) enum BlockReaderRef<'a> {
|
||||||
FileBlockReader(&'a FileBlockReader<'a>),
|
FileBlockReader(&'a FileBlockReader<'a>),
|
||||||
|
EphemeralFile(&'a EphemeralFile),
|
||||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||||
|
Slice(&'a [u8]),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -97,7 +100,9 @@ impl<'a> BlockReaderRef<'a> {
|
|||||||
use BlockReaderRef::*;
|
use BlockReaderRef::*;
|
||||||
match self {
|
match self {
|
||||||
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
||||||
|
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
|
||||||
Adapter(r) => r.read_blk(blknum, ctx).await,
|
Adapter(r) => r.read_blk(blknum, ctx).await,
|
||||||
|
Slice(s) => Self::read_blk_slice(s, blknum),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
TestDisk(r) => r.read_blk(blknum),
|
TestDisk(r) => r.read_blk(blknum),
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -106,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> BlockReaderRef<'a> {
|
||||||
|
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
|
||||||
|
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
|
||||||
|
let end = start.checked_add(PAGE_SZ).unwrap();
|
||||||
|
if end > slice.len() {
|
||||||
|
return Err(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::UnexpectedEof,
|
||||||
|
format!("slice too short, len={} end={}", slice.len(), end),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
let slice = &slice[start..end];
|
||||||
|
let page_sized: &[u8; PAGE_SZ] = slice
|
||||||
|
.try_into()
|
||||||
|
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
|
||||||
|
Ok(BlockLease::Slice(page_sized))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||||
///
|
///
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user