mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 17:40:37 +00:00
Compare commits
3 Commits
jcsp/s3-dr
...
bayandin/1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9e7905979d | ||
|
|
0b3aedb830 | ||
|
|
93d3d98d2a |
4
.github/ISSUE_TEMPLATE/epic-template.md
vendored
4
.github/ISSUE_TEMPLATE/epic-template.md
vendored
@@ -16,9 +16,9 @@ assignees: ''
|
||||
|
||||
## Implementation ideas
|
||||
|
||||
## Tasks
|
||||
|
||||
```[tasklist]
|
||||
- [ ] Example Task
|
||||
### Tasks
|
||||
```
|
||||
|
||||
|
||||
|
||||
479
.github/workflows/benchmarking.yml
vendored
479
.github/workflows/benchmarking.yml
vendored
@@ -49,76 +49,6 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
bench:
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Run benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
# Set --sparse-ordering option of pytest-order plugin
|
||||
# to ensure tests are running in order of appears in the file.
|
||||
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
generate-matrices:
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
#
|
||||
@@ -135,72 +65,31 @@ jobs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
|
||||
tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
|
||||
pgbench-compare-big-db-matrix: ${{ steps.pgbench-compare-big-db-matrix.outputs.matrix }}
|
||||
|
||||
steps:
|
||||
- name: Generate matrix for pgbench benchmark
|
||||
id: pgbench-compare-matrix
|
||||
- name: Generate matrix for pgbench benchmark with big databases
|
||||
id: pgbench-compare-big-db-matrix
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-new",
|
||||
"neon-captest-reuse",
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
|
||||
}'
|
||||
# There's also `neonvm-captest-reuse` platform, but we don't want to use it
|
||||
# because DevRel team used it for some demos, they might need it again
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Generate matrix for OLAP benchmarks
|
||||
id: olap-compare-matrix
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-reuse"
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||
{ "platform": "rds-aurora" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Generate matrix for TPC-H benchmarks
|
||||
id: tpch-compare-matrix
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-reuse"
|
||||
],
|
||||
"scale": [
|
||||
"10"
|
||||
]
|
||||
"db_size": [ "1tb" ]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
pgbench-compare:
|
||||
pgbench-compare-big-db:
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}}
|
||||
matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-big-db-matrix)}}
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
@@ -217,9 +106,6 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
timeout-minutes: 480
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -235,32 +121,15 @@ jobs:
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_PGBENCH_1TB_CONNSTR }}
|
||||
;;
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
||||
neonvm-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_NEONVM_PGBENCH_1TB_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
||||
@@ -270,24 +139,7 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Benchmark simple-update
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -315,13 +167,6 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
@@ -331,300 +176,6 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
clickbench-compare:
|
||||
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
# Run this job only when pgbench-compare is finished to avoid the intersection.
|
||||
# We might change it after https://github.com/neondatabase/neon/issues/2900.
|
||||
#
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices, pgbench-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
tpch-compare:
|
||||
# TCP-H DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare & clickbench-compare.
|
||||
# Run this job only when clickbench-compare is finished to avoid the intersection.
|
||||
# We might change it after https://github.com/neondatabase/neon/issues/2900.
|
||||
#
|
||||
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices, clickbench-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
ENV_PLATFORM=CAPTEST_TPCH
|
||||
;;
|
||||
rds-aurora)
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
rds-postgres)
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
|
||||
echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
user-examples-compare:
|
||||
if: ${{ !cancelled() }}
|
||||
needs: [ generate-matrices, tpch-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERY="SELECT version();"
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }} (${{ matrix.db_size }}): ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
5
.github/workflows/build_and_test.yml
vendored
5
.github/workflows/build_and_test.yml
vendored
@@ -461,7 +461,6 @@ jobs:
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
test_selection: regress
|
||||
@@ -475,7 +474,7 @@ jobs:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
@@ -555,7 +554,7 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/control_plane/attachment_service @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/libs/postgres_ffi/ @neondatabase/compute
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/proxy/ @neondatabase/proxy
|
||||
/safekeeper/ @neondatabase/safekeepers
|
||||
/vendor/ @neondatabase/compute
|
||||
|
||||
359
Cargo.lock
generated
359
Cargo.lock
generated
@@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -282,10 +282,8 @@ dependencies = [
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
@@ -628,7 +626,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"rustls 0.21.9",
|
||||
"rustls",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
@@ -909,16 +907,6 @@ version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
|
||||
|
||||
[[package]]
|
||||
name = "bcder"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "1.3.3"
|
||||
@@ -947,7 +935,7 @@ dependencies = [
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
"which",
|
||||
]
|
||||
|
||||
@@ -998,9 +986,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.5.0"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
|
||||
checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -1161,7 +1149,7 @@ dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1346,7 +1334,6 @@ dependencies = [
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
@@ -1587,7 +1574,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1598,7 +1585,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1640,16 +1627,6 @@ dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "der"
|
||||
version = "0.7.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c"
|
||||
dependencies = [
|
||||
"const-oid",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "der-parser"
|
||||
version = "8.2.0"
|
||||
@@ -1704,7 +1681,7 @@ dependencies = [
|
||||
"diesel_table_macro_syntax",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1724,7 +1701,7 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
|
||||
dependencies = [
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1746,7 +1723,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1770,10 +1747,10 @@ version = "0.14.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c"
|
||||
dependencies = [
|
||||
"der 0.6.1",
|
||||
"der",
|
||||
"elliptic-curve",
|
||||
"rfc6979",
|
||||
"signature 1.6.4",
|
||||
"signature",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1790,7 +1767,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3"
|
||||
dependencies = [
|
||||
"base16ct",
|
||||
"crypto-bigint 0.4.9",
|
||||
"der 0.6.1",
|
||||
"der",
|
||||
"digest",
|
||||
"ff",
|
||||
"generic-array",
|
||||
@@ -1850,7 +1827,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2110,7 +2087,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2493,10 +2470,10 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"hyper",
|
||||
"log",
|
||||
"rustls 0.21.9",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-rustls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2734,7 +2711,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"js-sys",
|
||||
"pem",
|
||||
"pem 3.0.3",
|
||||
"ring 0.17.6",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2982,9 +2959,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "mio"
|
||||
version = "0.8.11"
|
||||
version = "0.8.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
|
||||
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
@@ -3257,7 +3234,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3530,7 +3507,6 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -3740,7 +3716,7 @@ dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3778,6 +3754,16 @@ version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
@@ -3839,7 +3825,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3860,8 +3846,8 @@ version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba"
|
||||
dependencies = [
|
||||
"der 0.6.1",
|
||||
"spki 0.6.0",
|
||||
"der",
|
||||
"spki",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3901,7 +3887,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3914,7 +3900,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3925,7 +3911,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3938,13 +3924,12 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3961,14 +3946,14 @@ dependencies = [
|
||||
"futures",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"rustls 0.22.2",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -4057,7 +4042,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4068,9 +4053,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.78"
|
||||
version = "1.0.66"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
|
||||
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -4217,8 +4202,8 @@ dependencies = [
|
||||
"routerify",
|
||||
"rstest",
|
||||
"rustc-hash",
|
||||
"rustls 0.22.2",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -4231,10 +4216,11 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemallocator",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
@@ -4262,9 +4248,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.35"
|
||||
version = "1.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
|
||||
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
@@ -4385,12 +4371,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rcgen"
|
||||
version = "0.12.1"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
|
||||
checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976"
|
||||
dependencies = [
|
||||
"pem",
|
||||
"ring 0.17.6",
|
||||
"pem 2.0.1",
|
||||
"ring 0.16.20",
|
||||
"time",
|
||||
"yasna",
|
||||
]
|
||||
@@ -4408,15 +4394,15 @@ dependencies = [
|
||||
"itoa",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.21.9",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"rustls-pemfile",
|
||||
"rustls-webpki 0.101.7",
|
||||
"ryu",
|
||||
"sha1_smol",
|
||||
"socket2 0.4.9",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"url",
|
||||
]
|
||||
@@ -4562,14 +4548,14 @@ dependencies = [
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.21.9",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"tower-service",
|
||||
"url",
|
||||
@@ -4735,7 +4721,7 @@ dependencies = [
|
||||
"regex",
|
||||
"relative-path",
|
||||
"rustc_version",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
@@ -4819,20 +4805,6 @@ dependencies = [
|
||||
"sct",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.22.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
|
||||
dependencies = [
|
||||
"log",
|
||||
"ring 0.17.6",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki 0.102.2",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-native-certs"
|
||||
version = "0.6.2"
|
||||
@@ -4840,7 +4812,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50"
|
||||
dependencies = [
|
||||
"openssl-probe",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"rustls-pemfile",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
]
|
||||
@@ -4854,22 +4826,6 @@ dependencies = [
|
||||
"base64 0.21.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pemfile"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pki-types"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.100.2"
|
||||
@@ -4890,17 +4846,6 @@ dependencies = [
|
||||
"untrusted 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.102.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
|
||||
dependencies = [
|
||||
"ring 0.17.6",
|
||||
"rustls-pki-types",
|
||||
"untrusted 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.12"
|
||||
@@ -4943,7 +4888,7 @@ dependencies = [
|
||||
"serde_with",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-rustls",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
@@ -5078,7 +5023,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928"
|
||||
dependencies = [
|
||||
"base16ct",
|
||||
"der 0.6.1",
|
||||
"der",
|
||||
"generic-array",
|
||||
"pkcs8",
|
||||
"subtle",
|
||||
@@ -5122,7 +5067,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
|
||||
dependencies = [
|
||||
"httpdate",
|
||||
"reqwest",
|
||||
"rustls 0.21.9",
|
||||
"rustls",
|
||||
"sentry-backtrace",
|
||||
"sentry-contexts",
|
||||
"sentry-core",
|
||||
@@ -5244,7 +5189,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5325,7 +5270,7 @@ dependencies = [
|
||||
"darling",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5347,23 +5292,13 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.8"
|
||||
version = "0.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
|
||||
checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
"sha2-asm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha2-asm"
|
||||
version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5421,15 +5356,6 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signature"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
|
||||
dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "simple_asn1"
|
||||
version = "0.6.2"
|
||||
@@ -5514,17 +5440,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
"der 0.6.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "spki"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
"der 0.7.8",
|
||||
"der",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5610,9 +5526,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
|
||||
|
||||
[[package]]
|
||||
name = "svg_fmt"
|
||||
version = "0.4.2"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
|
||||
checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
@@ -5627,9 +5543,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.52"
|
||||
version = "2.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
|
||||
checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -5744,22 +5660,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.57"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b"
|
||||
checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.57"
|
||||
version = "1.0.47"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81"
|
||||
checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5879,10 +5795,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.36.0"
|
||||
name = "tls-listener"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931"
|
||||
checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"hyper",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
"bytes",
|
||||
@@ -5899,7 +5829,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -5930,7 +5860,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5946,7 +5876,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -5968,17 +5898,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres-rustls"
|
||||
version = "0.11.1"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
|
||||
checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"ring 0.17.6",
|
||||
"rustls 0.22.2",
|
||||
"ring 0.16.20",
|
||||
"rustls",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.25.0",
|
||||
"x509-certificate",
|
||||
"tokio-rustls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5987,18 +5916,7 @@ version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
|
||||
dependencies = [
|
||||
"rustls 0.21.9",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-rustls"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
|
||||
dependencies = [
|
||||
"rustls 0.22.2",
|
||||
"rustls-pki-types",
|
||||
"rustls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -6113,9 +6031,9 @@ dependencies = [
|
||||
"pin-project",
|
||||
"prost",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"rustls-pemfile",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-rustls",
|
||||
"tokio-stream",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
@@ -6211,7 +6129,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6427,7 +6345,7 @@ dependencies = [
|
||||
"base64 0.21.1",
|
||||
"log",
|
||||
"once_cell",
|
||||
"rustls 0.21.9",
|
||||
"rustls",
|
||||
"rustls-webpki 0.100.2",
|
||||
"url",
|
||||
"webpki-roots 0.23.1",
|
||||
@@ -6436,7 +6354,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
@@ -6479,7 +6397,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
@@ -6518,14 +6435,12 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -6672,7 +6587,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
@@ -6706,7 +6621,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
@@ -7039,19 +6954,19 @@ dependencies = [
|
||||
"regex-automata 0.4.3",
|
||||
"regex-syntax 0.8.2",
|
||||
"reqwest",
|
||||
"rustls 0.21.9",
|
||||
"ring 0.16.20",
|
||||
"rustls",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"subtle",
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
"time",
|
||||
"time-macros",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"toml_datetime",
|
||||
"toml_edit",
|
||||
@@ -7062,31 +6977,11 @@ dependencies = [
|
||||
"tungstenite",
|
||||
"url",
|
||||
"uuid",
|
||||
"zeroize",
|
||||
"zstd",
|
||||
"zstd-safe",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "x509-certificate"
|
||||
version = "0.23.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85"
|
||||
dependencies = [
|
||||
"bcder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"der 0.7.8",
|
||||
"hex",
|
||||
"pem",
|
||||
"ring 0.17.6",
|
||||
"signature 2.2.0",
|
||||
"spki 0.7.3",
|
||||
"thiserror",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "x509-parser"
|
||||
version = "0.15.0"
|
||||
@@ -7145,7 +7040,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7153,20 +7048,6 @@ name = "zeroize"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
dependencies = [
|
||||
"zeroize_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize_derive"
|
||||
version = "1.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
|
||||
11
Cargo.toml
11
Cargo.toml
@@ -129,8 +129,8 @@ reqwest-retry = "0.2.2"
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = "0.22"
|
||||
rustls-pemfile = "2"
|
||||
rustls = "0.21"
|
||||
rustls-pemfile = "1"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
@@ -156,11 +156,12 @@ test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
tikv-jemalloc-ctl = "0.5"
|
||||
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.11.0"
|
||||
tokio-rustls = "0.25"
|
||||
tokio-postgres-rustls = "0.10.0"
|
||||
tokio-rustls = "0.24"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
@@ -219,7 +220,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.12"
|
||||
rcgen = "0.11"
|
||||
rstest = "0.18"
|
||||
camino-tempfile = "1.0.2"
|
||||
tonic-build = "0.9"
|
||||
|
||||
@@ -53,7 +53,7 @@ RUN set -e \
|
||||
--bin pagectl \
|
||||
--bin safekeeper \
|
||||
--bin storage_broker \
|
||||
--bin storage_controller \
|
||||
--bin attachment_service \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--locked --release \
|
||||
@@ -81,7 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
|
||||
|
||||
2
Makefile
2
Makefile
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
|
||||
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
|
||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||
|
||||
#
|
||||
|
||||
@@ -238,14 +238,6 @@ If you encounter errors during setting up the initial tenant, it's best to stop
|
||||
|
||||
## Running tests
|
||||
|
||||
### Rust unit tests
|
||||
|
||||
We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
|
||||
Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
|
||||
You can install `cargo-nextest` with `cargo install cargo-nextest`.
|
||||
|
||||
### Integration tests
|
||||
|
||||
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
||||
|
||||
```sh
|
||||
|
||||
@@ -2,13 +2,4 @@ disallowed-methods = [
|
||||
"tokio::task::block_in_place",
|
||||
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
||||
# "tokio::runtime::Handle::block_on",
|
||||
# use tokio_epoll_uring_ext instead
|
||||
"tokio_epoll_uring::thread_local_system",
|
||||
]
|
||||
|
||||
disallowed-macros = [
|
||||
# use std::pin::pin
|
||||
"futures::pin_mut",
|
||||
# cannot disallow this, because clippy finds used from tokio macros
|
||||
#"tokio::pin",
|
||||
]
|
||||
|
||||
@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
|
||||
-b /usr/local/bin/postgres
|
||||
```
|
||||
|
||||
## State Diagram
|
||||
|
||||
Computes can be in various states. Below is a diagram that details how a
|
||||
compute moves between states.
|
||||
|
||||
```mermaid
|
||||
%% https://mermaid.js.org/syntax/stateDiagram.html
|
||||
stateDiagram-v2
|
||||
[*] --> Empty : Compute spawned
|
||||
Empty --> ConfigurationPending : Waiting for compute spec
|
||||
ConfigurationPending --> Configuration : Received compute spec
|
||||
Configuration --> Failed : Failed to configure the compute
|
||||
Configuration --> Running : Compute has been configured
|
||||
Empty --> Init : Compute spec is immediately available
|
||||
Empty --> TerminationPending : Requested termination
|
||||
Init --> Failed : Failed to start Postgres
|
||||
Init --> Running : Started Postgres
|
||||
Running --> TerminationPending : Requested termination
|
||||
TerminationPending --> Terminated : Terminated compute
|
||||
Failed --> [*] : Compute exited
|
||||
Terminated --> [*] : Compute exited
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
Cargo formatter:
|
||||
|
||||
@@ -17,8 +17,6 @@ use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use nix::unistd::Pid;
|
||||
use postgres::error::SqlState;
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -397,9 +395,9 @@ impl ComputeNode {
|
||||
// Gets the basebackup in a retry loop
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let mut retry_period_ms = 500.0;
|
||||
let mut retry_period_ms = 500;
|
||||
let mut attempts = 0;
|
||||
let max_attempts = 10;
|
||||
let max_attempts = 5;
|
||||
loop {
|
||||
let result = self.try_get_basebackup(compute_state, lsn);
|
||||
match result {
|
||||
@@ -411,8 +409,8 @@ impl ComputeNode {
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
|
||||
retry_period_ms *= 2;
|
||||
}
|
||||
Err(_) => {
|
||||
return result;
|
||||
@@ -723,12 +721,8 @@ impl ComputeNode {
|
||||
// Stop it when it's ready
|
||||
info!("waiting for postgres");
|
||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||
// SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
|
||||
// it to avoid orphaned processes prowling around while datadir is
|
||||
// wiped.
|
||||
let pm_pid = Pid::from_raw(pg.id() as i32);
|
||||
kill(pm_pid, Signal::SIGQUIT)?;
|
||||
info!("sent SIGQUIT signal");
|
||||
pg.kill()?;
|
||||
info!("sent kill signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
|
||||
@@ -769,26 +763,6 @@ impl ComputeNode {
|
||||
Ok((pg, logs_handle))
|
||||
}
|
||||
|
||||
/// Do post configuration of the already started Postgres. This function spawns a background thread to
|
||||
/// configure the database after applying the compute spec. Currently, it upgrades the neon extension
|
||||
/// version. In the future, it may upgrade all 3rd-party extensions.
|
||||
#[instrument(skip_all)]
|
||||
pub fn post_apply_config(&self) -> Result<()> {
|
||||
let connstr = self.connstr.clone();
|
||||
thread::spawn(move || {
|
||||
let func = || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_neon_extension_upgrade(&mut client)
|
||||
.context("handle_neon_extension_upgrade")?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
if let Err(err) = func() {
|
||||
error!("error while post_apply_config: {err:#}");
|
||||
}
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
#[instrument(skip_all)]
|
||||
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
@@ -800,34 +774,27 @@ impl ComputeNode {
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let connstr = self.connstr.clone();
|
||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||
Err(e) => match e.code() {
|
||||
Some(&SqlState::INVALID_PASSWORD)
|
||||
| Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
|
||||
// connect with zenith_admin if cloud_admin could not authenticate
|
||||
info!(
|
||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||
e
|
||||
);
|
||||
let mut zenith_admin_connstr = connstr.clone();
|
||||
Err(e) => {
|
||||
info!(
|
||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||
e
|
||||
);
|
||||
let mut zenith_admin_connstr = connstr.clone();
|
||||
|
||||
zenith_admin_connstr
|
||||
.set_username("zenith_admin")
|
||||
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||
zenith_admin_connstr
|
||||
.set_username("zenith_admin")
|
||||
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||
|
||||
let mut client =
|
||||
Client::connect(zenith_admin_connstr.as_str(), NoTls)
|
||||
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
|
||||
// reconnect with connstring with expected name
|
||||
Client::connect(connstr.as_str(), NoTls)?
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
},
|
||||
// reconnect with connstring with expected name
|
||||
Client::connect(connstr.as_str(), NoTls)?
|
||||
}
|
||||
Ok(client) => client,
|
||||
};
|
||||
|
||||
@@ -1023,21 +990,18 @@ impl ComputeNode {
|
||||
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
let config_time = Utc::now();
|
||||
if pspec.spec.mode == ComputeMode::Primary {
|
||||
if !pspec.spec.skip_pg_catalog_updates {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are applying config:
|
||||
// creating new extensions, roles, etc...
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
self.pg_reload_conf()?;
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are applying config:
|
||||
// creating new extensions, roles, etc...
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
self.apply_config(&compute_state)?;
|
||||
self.apply_config(&compute_state)?;
|
||||
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
self.post_apply_config()?;
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
|
||||
let startup_end_time = Utc::now();
|
||||
|
||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||
// from neon_superuser.
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
@@ -743,21 +743,9 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
||||
// which may happen in two cases:
|
||||
// - extension was just installed
|
||||
// - extension was already installed and is up to date
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade (not really)");
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension schema with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -807,18 +795,6 @@ $$;"#,
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
|
||||
@@ -12,7 +12,6 @@ clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
@@ -4,10 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "storage_controller"
|
||||
path = "src/main.rs"
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs and behaviors
|
||||
@@ -19,10 +15,8 @@ aws-config.workspace = true
|
||||
aws-sdk-secretsmanager.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
|
||||
UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}';
|
||||
UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}';
|
||||
@@ -1,3 +0,0 @@
|
||||
|
||||
UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}';
|
||||
UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"';
|
||||
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
|
||||
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -19,66 +19,8 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
|
||||
|
||||
pub(crate) const API_CONCURRENCY: usize = 32;
|
||||
|
||||
struct ShardedComputeHookTenant {
|
||||
stripe_size: ShardStripeSize,
|
||||
shard_count: ShardCount,
|
||||
shards: Vec<(ShardNumber, NodeId)>,
|
||||
}
|
||||
|
||||
enum ComputeHookTenant {
|
||||
Unsharded(NodeId),
|
||||
Sharded(ShardedComputeHookTenant),
|
||||
}
|
||||
|
||||
impl ComputeHookTenant {
|
||||
/// Construct with at least one shard's information
|
||||
fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
|
||||
if tenant_shard_id.shard_count.count() > 1 {
|
||||
Self::Sharded(ShardedComputeHookTenant {
|
||||
shards: vec![(tenant_shard_id.shard_number, node_id)],
|
||||
stripe_size,
|
||||
shard_count: tenant_shard_id.shard_count,
|
||||
})
|
||||
} else {
|
||||
Self::Unsharded(node_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
|
||||
/// and drops existing content.
|
||||
fn update(
|
||||
&mut self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
stripe_size: ShardStripeSize,
|
||||
node_id: NodeId,
|
||||
) {
|
||||
match self {
|
||||
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
|
||||
*existing_node_id = node_id
|
||||
}
|
||||
Self::Sharded(sharded_tenant)
|
||||
if sharded_tenant.stripe_size == stripe_size
|
||||
&& sharded_tenant.shard_count == tenant_shard_id.shard_count =>
|
||||
{
|
||||
if let Some(existing) = sharded_tenant
|
||||
.shards
|
||||
.iter()
|
||||
.position(|s| s.0 == tenant_shard_id.shard_number)
|
||||
{
|
||||
sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
|
||||
} else {
|
||||
sharded_tenant
|
||||
.shards
|
||||
.push((tenant_shard_id.shard_number, node_id));
|
||||
sharded_tenant.shards.sort_by_key(|s| s.0)
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Shard count changed: reset struct.
|
||||
*self = Self::new(tenant_shard_id, stripe_size, node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(super) struct ComputeHookTenant {
|
||||
shards: Vec<(ShardIndex, NodeId)>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -91,7 +33,6 @@ struct ComputeHookNotifyRequestShard {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct ComputeHookNotifyRequest {
|
||||
tenant_id: TenantId,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
shards: Vec<ComputeHookNotifyRequestShard>,
|
||||
}
|
||||
|
||||
@@ -122,43 +63,42 @@ pub(crate) enum NotifyError {
|
||||
}
|
||||
|
||||
impl ComputeHookTenant {
|
||||
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
|
||||
match self {
|
||||
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
shards: vec![ComputeHookNotifyRequestShard {
|
||||
shard_number: ShardNumber(0),
|
||||
node_id: *node_id,
|
||||
}],
|
||||
stripe_size: None,
|
||||
}),
|
||||
Self::Sharded(sharded_tenant)
|
||||
if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
|
||||
{
|
||||
Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
shards: sharded_tenant
|
||||
.shards
|
||||
.iter()
|
||||
.map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
|
||||
shard_number: *shard_number,
|
||||
node_id: *node_id,
|
||||
})
|
||||
.collect(),
|
||||
stripe_size: Some(sharded_tenant.stripe_size),
|
||||
})
|
||||
}
|
||||
Self::Sharded(sharded_tenant) => {
|
||||
// Sharded tenant doesn't yet have information for all its shards
|
||||
async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
|
||||
// Find the highest shard count and drop any shards that aren't
|
||||
// for that shard count.
|
||||
let shard_count = self.shards.iter().map(|(k, _v)| k.shard_count).max();
|
||||
let Some(shard_count) = shard_count else {
|
||||
// No shards, nothing to do.
|
||||
tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
|
||||
return None;
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
sharded_tenant.shards.len(),
|
||||
sharded_tenant.shard_count.count()
|
||||
);
|
||||
None
|
||||
}
|
||||
self.shards.retain(|(k, _v)| k.shard_count == shard_count);
|
||||
self.shards
|
||||
.sort_by_key(|(shard, _node_id)| shard.shard_number);
|
||||
|
||||
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
|
||||
// We have pageservers for all the shards: emit a configuration update
|
||||
return Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
shards: self
|
||||
.shards
|
||||
.iter()
|
||||
.map(|(shard, node_id)| ComputeHookNotifyRequestShard {
|
||||
shard_number: shard.shard_number,
|
||||
node_id: *node_id,
|
||||
})
|
||||
.collect(),
|
||||
});
|
||||
} else {
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
self.shards.len(),
|
||||
shard_count.count()
|
||||
);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,11 +139,7 @@ impl ComputeHook {
|
||||
};
|
||||
let cplane =
|
||||
ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
|
||||
let ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
shards,
|
||||
stripe_size,
|
||||
} = reconfigure_request;
|
||||
let ComputeHookNotifyRequest { tenant_id, shards } = reconfigure_request;
|
||||
|
||||
let compute_pageservers = shards
|
||||
.into_iter()
|
||||
@@ -220,9 +156,7 @@ impl ComputeHook {
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint
|
||||
.reconfigure(compute_pageservers.clone(), stripe_size)
|
||||
.await?;
|
||||
endpoint.reconfigure(compute_pageservers.clone()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -337,26 +271,30 @@ impl ComputeHook {
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
stripe_size: ShardStripeSize,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), NotifyError> {
|
||||
let mut locked = self.state.lock().await;
|
||||
let entry = locked
|
||||
.entry(tenant_shard_id.tenant_id)
|
||||
.or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
|
||||
|
||||
use std::collections::hash_map::Entry;
|
||||
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
|
||||
tenant_shard_id,
|
||||
stripe_size,
|
||||
node_id,
|
||||
)),
|
||||
Entry::Occupied(e) => {
|
||||
let tenant = e.into_mut();
|
||||
tenant.update(tenant_shard_id, stripe_size, node_id);
|
||||
tenant
|
||||
}
|
||||
let shard_index = ShardIndex {
|
||||
shard_count: tenant_shard_id.shard_count,
|
||||
shard_number: tenant_shard_id.shard_number,
|
||||
};
|
||||
|
||||
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
|
||||
let mut set = false;
|
||||
for (existing_shard, existing_node) in &mut entry.shards {
|
||||
if *existing_shard == shard_index {
|
||||
*existing_node = node_id;
|
||||
set = true;
|
||||
}
|
||||
}
|
||||
if !set {
|
||||
entry.shards.push((shard_index, node_id));
|
||||
}
|
||||
|
||||
let reconfigure_request = entry.maybe_reconfigure(tenant_shard_id.tenant_id).await;
|
||||
let Some(reconfigure_request) = reconfigure_request else {
|
||||
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||
// until it does.
|
||||
@@ -378,85 +316,3 @@ impl ComputeHook {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn tenant_updates() -> anyhow::Result<()> {
|
||||
let tenant_id = TenantId::generate();
|
||||
let mut tenant_state = ComputeHookTenant::new(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(0),
|
||||
shard_number: ShardNumber(0),
|
||||
},
|
||||
ShardStripeSize(12345),
|
||||
NodeId(1),
|
||||
);
|
||||
|
||||
// An unsharded tenant is always ready to emit a notification
|
||||
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
|
||||
assert_eq!(
|
||||
tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.shards
|
||||
.len(),
|
||||
1
|
||||
);
|
||||
assert!(tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.stripe_size
|
||||
.is_none());
|
||||
|
||||
// Writing the first shard of a multi-sharded situation (i.e. in a split)
|
||||
// resets the tenant state and puts it in an non-notifying state (need to
|
||||
// see all shards)
|
||||
tenant_state.update(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(2),
|
||||
shard_number: ShardNumber(1),
|
||||
},
|
||||
ShardStripeSize(32768),
|
||||
NodeId(1),
|
||||
);
|
||||
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
|
||||
|
||||
// Writing the second shard makes it ready to notify
|
||||
tenant_state.update(
|
||||
TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(2),
|
||||
shard_number: ShardNumber(0),
|
||||
},
|
||||
ShardStripeSize(32768),
|
||||
NodeId(1),
|
||||
);
|
||||
|
||||
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
|
||||
assert_eq!(
|
||||
tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.shards
|
||||
.len(),
|
||||
2
|
||||
);
|
||||
assert_eq!(
|
||||
tenant_state
|
||||
.maybe_reconfigure(tenant_id)
|
||||
.unwrap()
|
||||
.stripe_size,
|
||||
Some(ShardStripeSize(32768))
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,227 +0,0 @@
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use pageserver_api::{
|
||||
controller_api::{NodeAvailability, UtilizationScore},
|
||||
models::PageserverUtilization,
|
||||
};
|
||||
|
||||
use thiserror::Error;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::node::Node;
|
||||
|
||||
struct HeartbeaterTask {
|
||||
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||
cancel: CancellationToken,
|
||||
|
||||
state: HashMap<NodeId, PageserverState>,
|
||||
|
||||
max_unavailable_interval: Duration,
|
||||
jwt_token: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum PageserverState {
|
||||
Available {
|
||||
last_seen_at: Instant,
|
||||
utilization: PageserverUtilization,
|
||||
},
|
||||
Offline,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub(crate) enum HeartbeaterError {
|
||||
#[error("Cancelled")]
|
||||
Cancel,
|
||||
}
|
||||
|
||||
struct HeartbeatRequest {
|
||||
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
|
||||
}
|
||||
|
||||
pub(crate) struct Heartbeater {
|
||||
sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
|
||||
}
|
||||
|
||||
impl Heartbeater {
|
||||
pub(crate) fn new(
|
||||
jwt_token: Option<String>,
|
||||
max_unavailable_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
|
||||
let mut heartbeater =
|
||||
HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
|
||||
tokio::task::spawn(async move { heartbeater.run().await });
|
||||
|
||||
Self { sender }
|
||||
}
|
||||
|
||||
pub(crate) async fn heartbeat(
|
||||
&self,
|
||||
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
) -> Result<AvailablityDeltas, HeartbeaterError> {
|
||||
let (sender, receiver) = tokio::sync::oneshot::channel();
|
||||
self.sender
|
||||
.send(HeartbeatRequest {
|
||||
pageservers,
|
||||
reply: sender,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
receiver.await.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl HeartbeaterTask {
|
||||
fn new(
|
||||
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||
jwt_token: Option<String>,
|
||||
max_unavailable_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
receiver,
|
||||
cancel,
|
||||
state: HashMap::new(),
|
||||
max_unavailable_interval,
|
||||
jwt_token,
|
||||
}
|
||||
}
|
||||
|
||||
async fn run(&mut self) {
|
||||
loop {
|
||||
tokio::select! {
|
||||
request = self.receiver.recv() => {
|
||||
match request {
|
||||
Some(req) => {
|
||||
let res = self.heartbeat(req.pageservers).await;
|
||||
req.reply.send(res).unwrap();
|
||||
},
|
||||
None => { return; }
|
||||
}
|
||||
},
|
||||
_ = self.cancel.cancelled() => return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn heartbeat(
|
||||
&mut self,
|
||||
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
) -> Result<AvailablityDeltas, HeartbeaterError> {
|
||||
let mut new_state = HashMap::new();
|
||||
|
||||
let mut heartbeat_futs = FuturesUnordered::new();
|
||||
for (node_id, node) in &*pageservers {
|
||||
heartbeat_futs.push({
|
||||
let jwt_token = self.jwt_token.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
|
||||
// Clone the node and mark it as available such that the request
|
||||
// goes through to the pageserver even when the node is marked offline.
|
||||
// This doesn't impact the availability observed by [`crate::service::Service`].
|
||||
let mut node = node.clone();
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
|
||||
async move {
|
||||
let response = node
|
||||
.with_client_retries(
|
||||
|client| async move { client.get_utilization().await },
|
||||
&jwt_token,
|
||||
2,
|
||||
3,
|
||||
Duration::from_secs(1),
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
let response = match response {
|
||||
Some(r) => r,
|
||||
None => {
|
||||
// This indicates cancellation of the request.
|
||||
// We ignore the node in this case.
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let status = if let Ok(utilization) = response {
|
||||
PageserverState::Available {
|
||||
last_seen_at: Instant::now(),
|
||||
utilization,
|
||||
}
|
||||
} else {
|
||||
PageserverState::Offline
|
||||
};
|
||||
|
||||
Some((*node_id, status))
|
||||
}
|
||||
});
|
||||
|
||||
loop {
|
||||
let maybe_status = tokio::select! {
|
||||
next = heartbeat_futs.next() => {
|
||||
match next {
|
||||
Some(result) => result,
|
||||
None => { break; }
|
||||
}
|
||||
},
|
||||
_ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
|
||||
};
|
||||
|
||||
if let Some((node_id, status)) = maybe_status {
|
||||
new_state.insert(node_id, status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut deltas = Vec::new();
|
||||
let now = Instant::now();
|
||||
for (node_id, ps_state) in new_state {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
let entry = self.state.entry(node_id);
|
||||
|
||||
let mut needs_update = false;
|
||||
match entry {
|
||||
Occupied(ref occ) => match (occ.get(), &ps_state) {
|
||||
(PageserverState::Offline, PageserverState::Offline) => {}
|
||||
(PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
|
||||
if now - *last_seen_at >= self.max_unavailable_interval {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
},
|
||||
Vacant(_) => {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
match entry {
|
||||
Occupied(mut occ) if needs_update => {
|
||||
(*occ.get_mut()) = ps_state;
|
||||
}
|
||||
Vacant(vac) => {
|
||||
vac.insert(ps_state);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(AvailablityDeltas(deltas))
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use crate::PlacementPolicy;
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
@@ -10,11 +11,9 @@ use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::auth::{Scope, SwappableJwtAuth};
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
||||
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
|
||||
use utils::http::request::{must_get_query_param, parse_request_param};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
@@ -28,11 +27,11 @@ use utils::{
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
@@ -120,9 +119,13 @@ async fn handle_tenant_create(
|
||||
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
|
||||
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
|
||||
// have no expectation of HA).
|
||||
let placement_policy = PlacementPolicy::Single;
|
||||
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
service.tenant_create(create_req).await?,
|
||||
service.tenant_create(create_req, placement_policy).await?,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -176,14 +179,14 @@ async fn handle_tenant_location_config(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_location_config(tenant_shard_id, config_req)
|
||||
.tenant_location_config(tenant_id, config_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
@@ -248,10 +251,8 @@ async fn handle_tenant_secondary_download(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
|
||||
|
||||
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
|
||||
json_response(status, progress)
|
||||
service.tenant_secondary_download(tenant_id).await?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
@@ -353,16 +354,6 @@ async fn handle_tenant_locate(
|
||||
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_tenant_describe(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
|
||||
}
|
||||
|
||||
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -401,14 +392,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.node_configure(
|
||||
config_req.node_id,
|
||||
config_req.availability.map(NodeAvailability::from),
|
||||
config_req.scheduling,
|
||||
)
|
||||
.await?,
|
||||
state.service.node_configure(config_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -569,17 +553,14 @@ pub fn make_router(
|
||||
request_span(r, handle_node_drop)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
request_span(r, handle_scheduler_dump)
|
||||
})
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.put("/debug/v1/failpoints", |r| {
|
||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
// Node operations
|
||||
.post("/control/v1/node", |r| {
|
||||
@@ -596,9 +577,6 @@ pub fn make_router(
|
||||
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_split)
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_describe)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
@@ -614,7 +592,7 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_get)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
|
||||
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
|
||||
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
|
||||
/// is needed at a tenant-wide granularity.
|
||||
pub(crate) struct IdLockMap<T>
|
||||
where
|
||||
T: Eq + PartialEq + std::hash::Hash,
|
||||
{
|
||||
/// A synchronous lock for getting/setting the async locks that our callers will wait on.
|
||||
entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
|
||||
}
|
||||
|
||||
impl<T> IdLockMap<T>
|
||||
where
|
||||
T: Eq + PartialEq + std::hash::Hash,
|
||||
{
|
||||
pub(crate) fn shared(
|
||||
&self,
|
||||
key: T,
|
||||
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
let entry = locked.entry(key).or_default();
|
||||
entry.clone().read_owned()
|
||||
}
|
||||
|
||||
pub(crate) fn exclusive(
|
||||
&self,
|
||||
key: T,
|
||||
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
let entry = locked.entry(key).or_default();
|
||||
entry.clone().write_owned()
|
||||
}
|
||||
|
||||
/// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
|
||||
/// periodic housekeeping to avoid the map growing indefinitely
|
||||
pub(crate) fn housekeeping(&self) {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
locked.retain(|_k, lock| lock.try_write().is_err())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Default for IdLockMap<T>
|
||||
where
|
||||
T: Eq + PartialEq + std::hash::Hash,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
entities: std::sync::Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,9 @@
|
||||
use serde::Serialize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod auth;
|
||||
mod compute_hook;
|
||||
mod heartbeater;
|
||||
pub mod http;
|
||||
mod id_lock_map;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
pub mod persistence;
|
||||
@@ -15,6 +13,23 @@ mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
enum PlacementPolicy {
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
/// Production-ready way to attach a tenant: one attached pageserver and
|
||||
/// some number of secondaries.
|
||||
Double(usize),
|
||||
/// Create one secondary mode locations. This is useful when onboarding
|
||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||
Secondary,
|
||||
|
||||
/// Do not attach to any pageservers. This is appropriate for tenants that
|
||||
/// have been idle for a long time, where we do not mind some delay in making
|
||||
/// them available in future.
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
||||
struct Sequence(u64);
|
||||
|
||||
@@ -51,3 +66,9 @@ impl Sequence {
|
||||
Sequence(self.0 + 1)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PlacementPolicy {
|
||||
fn default() -> Self {
|
||||
PlacementPolicy::Double(1)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
/// The attachment service mimics the aspects of the control plane API
|
||||
/// that are required for a pageserver to operate.
|
||||
///
|
||||
/// This enables running & testing pageservers without a full-blown
|
||||
/// deployment of the Neon cloud platform.
|
||||
///
|
||||
use anyhow::{anyhow, Context};
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||
use attachment_service::service::{Config, Service};
|
||||
use aws_config::{BehaviorVersion, Region};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
@@ -54,10 +60,6 @@ struct Cli {
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||
#[arg(long)]
|
||||
database_url: Option<String>,
|
||||
|
||||
/// Grace period before marking unresponsive pageserver offline
|
||||
#[arg(long)]
|
||||
max_unavailable_interval: Option<humantime::Duration>,
|
||||
}
|
||||
|
||||
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
||||
@@ -210,12 +212,6 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let default_panic = std::panic::take_hook();
|
||||
std::panic::set_hook(Box::new(move |info| {
|
||||
default_panic(info);
|
||||
std::process::exit(1);
|
||||
}));
|
||||
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
// We use spawn_blocking for database operations, so require approximately
|
||||
// as many blocking threads as we will open database connections.
|
||||
@@ -253,10 +249,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
jwt_token: secrets.jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
compute_hook_url: args.compute_hook_url,
|
||||
max_unavailable_interval: args
|
||||
.max_unavailable_interval
|
||||
.map(humantime::Duration::into)
|
||||
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
|
||||
};
|
||||
|
||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||
|
||||
@@ -1,18 +1,8 @@
|
||||
use std::{str::FromStr, time::Duration};
|
||||
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api;
|
||||
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, id::NodeId};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{persistence::NodePersistence, scheduler::MaySchedule};
|
||||
use crate::persistence::NodePersistence;
|
||||
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
@@ -22,29 +12,16 @@ use crate::{persistence::NodePersistence, scheduler::MaySchedule};
|
||||
/// implementation of serialization on this type is only for debug dumps.
|
||||
#[derive(Clone, Serialize)]
|
||||
pub(crate) struct Node {
|
||||
id: NodeId,
|
||||
pub(crate) id: NodeId,
|
||||
|
||||
availability: NodeAvailability,
|
||||
scheduling: NodeSchedulingPolicy,
|
||||
pub(crate) availability: NodeAvailability,
|
||||
pub(crate) scheduling: NodeSchedulingPolicy,
|
||||
|
||||
listen_http_addr: String,
|
||||
listen_http_port: u16,
|
||||
pub(crate) listen_http_addr: String,
|
||||
pub(crate) listen_http_port: u16,
|
||||
|
||||
listen_pg_addr: String,
|
||||
listen_pg_port: u16,
|
||||
|
||||
// This cancellation token means "stop any RPCs in flight to this node, and don't start
|
||||
// any more". It is not related to process shutdown.
|
||||
#[serde(skip)]
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
/// When updating [`Node::availability`] we use this type to indicate to the caller
|
||||
/// whether/how they changed it.
|
||||
pub(crate) enum AvailabilityTransition {
|
||||
ToActive,
|
||||
ToOffline,
|
||||
Unchanged,
|
||||
pub(crate) listen_pg_addr: String,
|
||||
pub(crate) listen_pg_port: u16,
|
||||
}
|
||||
|
||||
impl Node {
|
||||
@@ -52,111 +29,18 @@ impl Node {
|
||||
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
|
||||
}
|
||||
|
||||
pub(crate) fn get_id(&self) -> NodeId {
|
||||
self.id
|
||||
}
|
||||
|
||||
pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
|
||||
self.scheduling = scheduling
|
||||
}
|
||||
|
||||
/// Does this registration request match `self`? This is used when deciding whether a registration
|
||||
/// request should be allowed to update an existing record with the same node ID.
|
||||
pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
|
||||
self.id == register_req.node_id
|
||||
&& self.listen_http_addr == register_req.listen_http_addr
|
||||
&& self.listen_http_port == register_req.listen_http_port
|
||||
&& self.listen_pg_addr == register_req.listen_pg_addr
|
||||
&& self.listen_pg_port == register_req.listen_pg_port
|
||||
}
|
||||
|
||||
/// For a shard located on this node, populate a response object
|
||||
/// with this node's address information.
|
||||
pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard {
|
||||
TenantLocateResponseShard {
|
||||
shard_id,
|
||||
node_id: self.id,
|
||||
listen_http_addr: self.listen_http_addr.clone(),
|
||||
listen_http_port: self.listen_http_port,
|
||||
listen_pg_addr: self.listen_pg_addr.clone(),
|
||||
listen_pg_port: self.listen_pg_port,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
|
||||
match self.get_availability_transition(availability) {
|
||||
AvailabilityTransition::ToActive => {
|
||||
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
||||
// users of previously-cloned copies of the node will still see the old cancellation
|
||||
// state. For example, Reconcilers in flight will have to complete and be spawned
|
||||
// again to realize that the node has become available.
|
||||
self.cancel = CancellationToken::new();
|
||||
}
|
||||
AvailabilityTransition::ToOffline => {
|
||||
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
||||
self.cancel.cancel();
|
||||
}
|
||||
AvailabilityTransition::Unchanged => {}
|
||||
}
|
||||
self.availability = availability;
|
||||
}
|
||||
|
||||
/// Without modifying the availability of the node, convert the intended availability
|
||||
/// into a description of the transition.
|
||||
pub(crate) fn get_availability_transition(
|
||||
&self,
|
||||
availability: NodeAvailability,
|
||||
) -> AvailabilityTransition {
|
||||
use AvailabilityTransition::*;
|
||||
use NodeAvailability::*;
|
||||
|
||||
match (self.availability, availability) {
|
||||
(Offline, Active(_)) => ToActive,
|
||||
(Active(_), Offline) => ToOffline,
|
||||
_ => Unchanged,
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether we may send API requests to this node.
|
||||
pub(crate) fn is_available(&self) -> bool {
|
||||
// When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds
|
||||
// a reference to the original Node's cancellation status. Checking both of these results
|
||||
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
|
||||
// when we cloned it, or if the original Node instance's cancellation token was fired.
|
||||
matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
|
||||
}
|
||||
|
||||
/// Is this node elegible to have work scheduled onto it?
|
||||
pub(crate) fn may_schedule(&self) -> MaySchedule {
|
||||
let score = match self.availability {
|
||||
NodeAvailability::Active(score) => score,
|
||||
NodeAvailability::Offline => return MaySchedule::No,
|
||||
};
|
||||
pub(crate) fn may_schedule(&self) -> bool {
|
||||
match self.availability {
|
||||
NodeAvailability::Active => {}
|
||||
NodeAvailability::Offline => return false,
|
||||
}
|
||||
|
||||
match self.scheduling {
|
||||
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
id: NodeId,
|
||||
listen_http_addr: String,
|
||||
listen_http_port: u16,
|
||||
listen_pg_addr: String,
|
||||
listen_pg_port: u16,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
scheduling: NodeSchedulingPolicy::Filling,
|
||||
availability: NodeAvailability::Offline,
|
||||
cancel: CancellationToken::new(),
|
||||
NodeSchedulingPolicy::Active => true,
|
||||
NodeSchedulingPolicy::Draining => false,
|
||||
NodeSchedulingPolicy::Filling => true,
|
||||
NodeSchedulingPolicy::Pause => false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -170,96 +54,4 @@ impl Node {
|
||||
listen_pg_port: self.listen_pg_port as i32,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn from_persistent(np: NodePersistence) -> Self {
|
||||
Self {
|
||||
id: NodeId(np.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: np.listen_http_addr,
|
||||
listen_http_port: np.listen_http_port as u16,
|
||||
listen_pg_addr: np.listen_pg_addr,
|
||||
listen_pg_port: np.listen_pg_port as u16,
|
||||
cancel: CancellationToken::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for issuing requests to pageserver management API: takes care of generic
|
||||
/// retry/backoff for retryable HTTP status codes.
|
||||
///
|
||||
/// This will return None to indicate cancellation. Cancellation may happen from
|
||||
/// the cancellation token passed in, or from Self's cancellation token (i.e. node
|
||||
/// going offline).
|
||||
pub(crate) async fn with_client_retries<T, O, F>(
|
||||
&self,
|
||||
mut op: O,
|
||||
jwt: &Option<String>,
|
||||
warn_threshold: u32,
|
||||
max_retries: u32,
|
||||
timeout: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Option<mgmt_api::Result<T>>
|
||||
where
|
||||
O: FnMut(mgmt_api::Client) -> F,
|
||||
F: std::future::Future<Output = mgmt_api::Result<T>>,
|
||||
{
|
||||
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
||||
use mgmt_api::Error::*;
|
||||
match e {
|
||||
ReceiveBody(_) | ReceiveErrorBody(_) => false,
|
||||
ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
|
||||
| ApiError(StatusCode::GATEWAY_TIMEOUT, _)
|
||||
| ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
|
||||
ApiError(_, _) => true,
|
||||
Cancelled => true,
|
||||
}
|
||||
}
|
||||
|
||||
backoff::retry(
|
||||
|| {
|
||||
let http_client = reqwest::ClientBuilder::new()
|
||||
.timeout(timeout)
|
||||
.build()
|
||||
.expect("Failed to construct HTTP client");
|
||||
|
||||
let client =
|
||||
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
|
||||
|
||||
let node_cancel_fut = self.cancel.cancelled();
|
||||
|
||||
let op_fut = op(client);
|
||||
|
||||
async {
|
||||
tokio::select! {
|
||||
r = op_fut=> {r},
|
||||
_ = node_cancel_fut => {
|
||||
Err(mgmt_api::Error::Cancelled)
|
||||
}}
|
||||
}
|
||||
},
|
||||
is_fatal,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
&format!(
|
||||
"Call to node {} ({}:{}) management API",
|
||||
self.id, self.listen_http_addr, self.listen_http_port
|
||||
),
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Node {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{} ({})", self.id, self.listen_http_addr)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Node {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{} ({})", self.id, self.listen_http_addr)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,23 +7,23 @@ use self::split_state::SplitState;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
use diesel::{
|
||||
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
|
||||
Selectable, SelectableHelper,
|
||||
};
|
||||
use pageserver_api::controller_api::NodeSchedulingPolicy;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::ShardConfigError;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::node::Node;
|
||||
use crate::PlacementPolicy;
|
||||
|
||||
/// ## What do we store?
|
||||
///
|
||||
/// The storage controller service does not store most of its state durably.
|
||||
/// The attachment service does not store most of its state durably.
|
||||
///
|
||||
/// The essential things to store durably are:
|
||||
/// - generation numbers, as these must always advance monotonically to ensure data safety.
|
||||
@@ -37,7 +37,7 @@ use crate::node::Node;
|
||||
///
|
||||
/// ## Performance/efficiency
|
||||
///
|
||||
/// The storage controller service does not go via the database for most things: there are
|
||||
/// The attachment service does not go via the database for most things: there are
|
||||
/// a couple of places where we must, and where efficiency matters:
|
||||
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
|
||||
/// before it can attach a tenant, so this acts as a bound on how fast things like
|
||||
@@ -75,14 +75,6 @@ pub(crate) enum DatabaseError {
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub(crate) enum AbortShardSplitStatus {
|
||||
/// We aborted the split in the database by reverting to the parent shards
|
||||
Aborted,
|
||||
/// The split had already been persisted.
|
||||
Complete,
|
||||
}
|
||||
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
impl Persistence {
|
||||
@@ -211,10 +203,15 @@ impl Persistence {
|
||||
|
||||
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
|
||||
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
|
||||
for shard in decoded.tenants.values_mut() {
|
||||
if shard.placement_policy == "\"Single\"" {
|
||||
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
|
||||
shard.placement_policy = "{\"Attached\":0}".to_string();
|
||||
for (tenant_id, tenant) in &mut decoded.tenants {
|
||||
// Backward compat: an old attachments.json from before PR #6251, replace
|
||||
// empty strings with proper defaults.
|
||||
if tenant.tenant_id.is_empty() {
|
||||
tenant.tenant_id = tenant_id.to_string();
|
||||
tenant.config = serde_json::to_string(&TenantConfig::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -576,51 +573,6 @@ impl Persistence {
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Used when the remote part of a shard split failed: we will revert the database state to have only
|
||||
/// the parent shards, with SplitState::Idle.
|
||||
pub(crate) async fn abort_shard_split(
|
||||
&self,
|
||||
split_tenant_id: TenantId,
|
||||
new_shard_count: ShardCount,
|
||||
) -> DatabaseResult<AbortShardSplitStatus> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
count {new_shard_count:?} on tenant {split_tenant_id}"
|
||||
)));
|
||||
}
|
||||
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
|
||||
Ok(aborted)
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
@@ -655,28 +607,6 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) config: String,
|
||||
}
|
||||
|
||||
impl TenantShardPersistence {
|
||||
pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
|
||||
if self.shard_count == 0 {
|
||||
Ok(ShardIdentity::unsharded())
|
||||
} else {
|
||||
Ok(ShardIdentity::new(
|
||||
ShardNumber(self.shard_number as u8),
|
||||
ShardCount::new(self.shard_count as u8),
|
||||
ShardStripeSize(self.shard_stripe_size as u32),
|
||||
)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
|
||||
Ok(TenantShardId {
|
||||
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(self.shard_number as u8),
|
||||
shard_count: ShardCount::new(self.shard_count as u8),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
@@ -8,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
@@ -19,8 +19,6 @@ use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
|
||||
|
||||
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
|
||||
|
||||
/// Object with the lifetime of the background reconcile task that is created
|
||||
/// for tenants which have a difference between their intent and observed states.
|
||||
pub(super) struct Reconciler {
|
||||
@@ -30,16 +28,15 @@ pub(super) struct Reconciler {
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) generation: Option<Generation>,
|
||||
pub(crate) intent: TargetState,
|
||||
|
||||
/// Nodes not referenced by [`Self::intent`], from which we should try
|
||||
/// to detach this tenant shard.
|
||||
pub(crate) detach: Vec<Node>,
|
||||
|
||||
pub(crate) config: TenantConfig,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
pub(crate) service_config: service::Config,
|
||||
|
||||
/// A snapshot of the pageservers as they were when we were asked
|
||||
/// to reconcile.
|
||||
pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
|
||||
/// A hook to notify the running postgres instances when we change the location
|
||||
/// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag
|
||||
/// and guarantee eventual retries.
|
||||
@@ -70,37 +67,29 @@ pub(super) struct Reconciler {
|
||||
/// and the TargetState is just the instruction for a particular Reconciler run.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TargetState {
|
||||
pub(crate) attached: Option<Node>,
|
||||
pub(crate) secondary: Vec<Node>,
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
impl TargetState {
|
||||
pub(crate) fn from_intent(nodes: &HashMap<NodeId, Node>, intent: &IntentState) -> Self {
|
||||
pub(crate) fn from_intent(intent: &IntentState) -> Self {
|
||||
Self {
|
||||
attached: intent.get_attached().map(|n| {
|
||||
nodes
|
||||
.get(&n)
|
||||
.expect("Intent attached referenced non-existent node")
|
||||
.clone()
|
||||
}),
|
||||
secondary: intent
|
||||
.get_secondary()
|
||||
.iter()
|
||||
.map(|n| {
|
||||
nodes
|
||||
.get(n)
|
||||
.expect("Intent secondary referenced non-existent node")
|
||||
.clone()
|
||||
})
|
||||
.collect(),
|
||||
attached: *intent.get_attached(),
|
||||
secondary: intent.get_secondary().clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = self.secondary.clone();
|
||||
if let Some(node_id) = &self.attached {
|
||||
result.push(*node_id);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum ReconcileError {
|
||||
#[error(transparent)]
|
||||
Remote(#[from] mgmt_api::Error),
|
||||
#[error(transparent)]
|
||||
Notify(#[from] NotifyError),
|
||||
#[error("Cancelled")]
|
||||
@@ -112,83 +101,44 @@ pub(crate) enum ReconcileError {
|
||||
impl Reconciler {
|
||||
async fn location_config(
|
||||
&mut self,
|
||||
node: &Node,
|
||||
node_id: NodeId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<(), ReconcileError> {
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: None });
|
||||
|
||||
// TODO: amend locations that use long-polling: they will hit this timeout.
|
||||
let timeout = Duration::from_secs(25);
|
||||
|
||||
tracing::info!("location_config({node}) calling: {:?}", config);
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
let config_ref = &config;
|
||||
match node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
let config = config_ref.clone();
|
||||
client
|
||||
.location_config(tenant_shard_id, config.clone(), flush_ms, lazy)
|
||||
.await
|
||||
},
|
||||
&self.service_config.jwt_token,
|
||||
1,
|
||||
3,
|
||||
timeout,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Ok(_)) => {}
|
||||
Some(Err(e)) => return Err(e.into()),
|
||||
None => return Err(ReconcileError::Cancel),
|
||||
};
|
||||
tracing::info!("location_config({node}) complete: {:?}", config);
|
||||
) -> anyhow::Result<()> {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(&node_id)
|
||||
.expect("Pageserver may not be removed while referenced");
|
||||
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
|
||||
.insert(node.id, ObservedStateLocation { conf: None });
|
||||
|
||||
tracing::info!("location_config({}) calling: {:?}", node_id, config);
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
client
|
||||
.location_config(self.tenant_shard_id, config.clone(), flush_ms)
|
||||
.await?;
|
||||
tracing::info!("location_config({}) complete: {:?}", node_id, config);
|
||||
|
||||
self.observed
|
||||
.locations
|
||||
.insert(node.id, ObservedStateLocation { conf: Some(config) });
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_node(&self, node_id: &NodeId) -> Option<&Node> {
|
||||
if let Some(node) = self.intent.attached.as_ref() {
|
||||
if node.get_id() == *node_id {
|
||||
return Some(node);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(node) = self
|
||||
.intent
|
||||
.secondary
|
||||
.iter()
|
||||
.find(|n| n.get_id() == *node_id)
|
||||
{
|
||||
return Some(node);
|
||||
}
|
||||
|
||||
if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) {
|
||||
return Some(node);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
|
||||
let destination = if let Some(node) = &self.intent.attached {
|
||||
match self.observed.locations.get(&node.get_id()) {
|
||||
let destination = if let Some(node_id) = self.intent.attached {
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) => {
|
||||
// We will do a live migration only if the intended destination is not
|
||||
// currently in an attached state.
|
||||
match &conf.conf {
|
||||
Some(conf) if conf.mode == LocationConfigMode::Secondary => {
|
||||
// Fall through to do a live migration
|
||||
node
|
||||
node_id
|
||||
}
|
||||
None | Some(_) => {
|
||||
// Attached or uncertain: don't do a live migration, proceed
|
||||
@@ -201,7 +151,7 @@ impl Reconciler {
|
||||
None => {
|
||||
// Our destination is not attached: maybe live migrate if some other
|
||||
// node is currently attached. Fall through.
|
||||
node
|
||||
node_id
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -214,13 +164,15 @@ impl Reconciler {
|
||||
for (node_id, state) in &self.observed.locations {
|
||||
if let Some(observed_conf) = &state.conf {
|
||||
if observed_conf.mode == LocationConfigMode::AttachedSingle {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(node_id)
|
||||
.expect("Nodes may not be removed while referenced");
|
||||
// We will only attempt live migration if the origin is not offline: this
|
||||
// avoids trying to do it while reconciling after responding to an HA failover.
|
||||
if let Some(node) = self.get_node(node_id) {
|
||||
if node.is_available() {
|
||||
origin = Some(node.clone());
|
||||
break;
|
||||
}
|
||||
if !matches!(node.availability, NodeAvailability::Offline) {
|
||||
origin = Some(*node_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -233,7 +185,7 @@ impl Reconciler {
|
||||
|
||||
// We have an origin and a destination: proceed to do the live migration
|
||||
tracing::info!("Live migrating {}->{}", origin, destination);
|
||||
self.live_migrate(origin, destination.clone()).await?;
|
||||
self.live_migrate(origin, destination).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -241,8 +193,13 @@ impl Reconciler {
|
||||
async fn get_lsns(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: &Node,
|
||||
node_id: &NodeId,
|
||||
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(node_id)
|
||||
.expect("Pageserver may not be removed while referenced");
|
||||
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
|
||||
@@ -253,86 +210,19 @@ impl Reconciler {
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn secondary_download(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: &Node,
|
||||
) -> Result<(), ReconcileError> {
|
||||
// This is not the timeout for a request, but the total amount of time we're willing to wait
|
||||
// for a secondary location to get up to date before
|
||||
const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
|
||||
async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
|
||||
let node = self
|
||||
.pageservers
|
||||
.get(node_id)
|
||||
.expect("Pageserver may not be removed while referenced");
|
||||
|
||||
// This the long-polling interval for the secondary download requests we send to destination pageserver
|
||||
// during a migration.
|
||||
const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
|
||||
let client =
|
||||
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
loop {
|
||||
let (status, progress) = match node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(
|
||||
tenant_shard_id,
|
||||
Some(REQUEST_DOWNLOAD_TIMEOUT),
|
||||
)
|
||||
.await
|
||||
},
|
||||
&self.service_config.jwt_token,
|
||||
1,
|
||||
3,
|
||||
REQUEST_DOWNLOAD_TIMEOUT * 2,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
None => Err(ReconcileError::Cancel),
|
||||
Some(Ok(v)) => Ok(v),
|
||||
Some(Err(e)) => {
|
||||
// Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
|
||||
// attaching, but we should not let an issue with a secondary location stop us proceeding
|
||||
// with a live migration.
|
||||
tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
|
||||
return Ok(());
|
||||
}
|
||||
}?;
|
||||
|
||||
if status == StatusCode::OK {
|
||||
tracing::info!(
|
||||
"Downloads to {} complete: {}/{} layers, {}/{} bytes",
|
||||
node,
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
return Ok(());
|
||||
} else if status == StatusCode::ACCEPTED {
|
||||
let total_runtime = started_at.elapsed();
|
||||
if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
|
||||
tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
|
||||
total_runtime.as_millis(),
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
// Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
|
||||
// it just makes the I/O performance for users less good.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
|
||||
// to the pageserver is a long-poll.
|
||||
tracing::info!(
|
||||
"Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
|
||||
node,
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
match client.tenant_secondary_download(tenant_shard_id).await {
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
tracing::info!(" (skipping, destination wasn't in secondary mode)")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -340,14 +230,17 @@ impl Reconciler {
|
||||
async fn await_lsn(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: &Node,
|
||||
pageserver_id: &NodeId,
|
||||
baseline: HashMap<TimelineId, Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let latest = match self.get_lsns(tenant_shard_id, node).await {
|
||||
let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
|
||||
println!(
|
||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
||||
pageserver_id
|
||||
);
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
continue;
|
||||
}
|
||||
@@ -357,7 +250,7 @@ impl Reconciler {
|
||||
for (timeline_id, baseline_lsn) in &baseline {
|
||||
match latest.get(timeline_id) {
|
||||
Some(latest_lsn) => {
|
||||
tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
||||
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
|
||||
if latest_lsn < baseline_lsn {
|
||||
any_behind = true;
|
||||
}
|
||||
@@ -372,7 +265,7 @@ impl Reconciler {
|
||||
}
|
||||
|
||||
if !any_behind {
|
||||
tracing::info!("✅ LSN caught up. Proceeding...");
|
||||
println!("✅ LSN caught up. Proceeding...");
|
||||
break;
|
||||
} else {
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
@@ -384,11 +277,11 @@ impl Reconciler {
|
||||
|
||||
pub async fn live_migrate(
|
||||
&mut self,
|
||||
origin_ps: Node,
|
||||
dest_ps: Node,
|
||||
) -> Result<(), ReconcileError> {
|
||||
origin_ps_id: NodeId,
|
||||
dest_ps_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
// `maybe_live_migrate` is responsibble for sanity of inputs
|
||||
assert!(origin_ps.get_id() != dest_ps.get_id());
|
||||
assert!(origin_ps_id != dest_ps_id);
|
||||
|
||||
fn build_location_config(
|
||||
shard: &ShardIdentity,
|
||||
@@ -408,7 +301,10 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",);
|
||||
tracing::info!(
|
||||
"🔁 Switching origin pageserver {} to stale mode",
|
||||
origin_ps_id
|
||||
);
|
||||
|
||||
// FIXME: it is incorrect to use self.generation here, we should use the generation
|
||||
// from the ObservedState of the origin pageserver (it might be older than self.generation)
|
||||
@@ -419,18 +315,21 @@ impl Reconciler {
|
||||
self.generation,
|
||||
None,
|
||||
);
|
||||
self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false)
|
||||
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
|
||||
.await?;
|
||||
|
||||
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?);
|
||||
let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
|
||||
|
||||
// If we are migrating to a destination that has a secondary location, warm it up first
|
||||
if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) {
|
||||
if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
|
||||
if let Some(destination_conf) = &destination_conf.conf {
|
||||
if destination_conf.mode == LocationConfigMode::Secondary {
|
||||
tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",);
|
||||
self.secondary_download(self.tenant_shard_id, &dest_ps)
|
||||
.await?;
|
||||
tracing::info!(
|
||||
"🔁 Downloading latest layers to destination pageserver {}",
|
||||
dest_ps_id,
|
||||
);
|
||||
self.secondary_download(self.tenant_shard_id, &dest_ps_id)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -438,7 +337,7 @@ impl Reconciler {
|
||||
// Increment generation before attaching to new pageserver
|
||||
self.generation = Some(
|
||||
self.persistence
|
||||
.increment_generation(self.tenant_shard_id, dest_ps.get_id())
|
||||
.increment_generation(self.tenant_shard_id, dest_ps_id)
|
||||
.await?,
|
||||
);
|
||||
|
||||
@@ -450,23 +349,22 @@ impl Reconciler {
|
||||
None,
|
||||
);
|
||||
|
||||
tracing::info!("🔁 Attaching to pageserver {dest_ps}");
|
||||
self.location_config(&dest_ps, dest_conf, None, false)
|
||||
.await?;
|
||||
tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
|
||||
self.location_config(dest_ps_id, dest_conf, None).await?;
|
||||
|
||||
if let Some(baseline) = baseline_lsns {
|
||||
tracing::info!("🕑 Waiting for LSN to catch up...");
|
||||
self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
|
||||
self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
|
||||
.await?;
|
||||
}
|
||||
|
||||
tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
|
||||
tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
|
||||
|
||||
// During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
|
||||
// the origin without notifying compute, we will render the tenant unavailable.
|
||||
while let Err(e) = self.compute_notify().await {
|
||||
match e {
|
||||
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
|
||||
NotifyError::Fatal(_) => return Err(anyhow::anyhow!(e)),
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Live migration blocked by compute notification error, retrying: {e}"
|
||||
@@ -475,7 +373,7 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
|
||||
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
|
||||
// this location will be deleted in the general case reconciliation that runs after this.
|
||||
let origin_secondary_conf = build_location_config(
|
||||
&self.shard,
|
||||
@@ -484,19 +382,22 @@ impl Reconciler {
|
||||
None,
|
||||
Some(LocationConfigSecondary { warm: true }),
|
||||
);
|
||||
self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false)
|
||||
self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
|
||||
.await?;
|
||||
// TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
|
||||
// partway through. In fact, all location conf API calls should be in a wrapper that sets
|
||||
// the observed state to None, then runs, then sets it to what we wrote.
|
||||
self.observed.locations.insert(
|
||||
origin_ps.get_id(),
|
||||
origin_ps_id,
|
||||
ObservedStateLocation {
|
||||
conf: Some(origin_secondary_conf),
|
||||
},
|
||||
);
|
||||
|
||||
tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",);
|
||||
println!(
|
||||
"🔁 Switching to AttachedSingle mode on pageserver {}",
|
||||
dest_ps_id
|
||||
);
|
||||
let dest_final_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
@@ -504,73 +405,16 @@ impl Reconciler {
|
||||
self.generation,
|
||||
None,
|
||||
);
|
||||
self.location_config(&dest_ps, dest_final_conf.clone(), None, false)
|
||||
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
|
||||
.await?;
|
||||
self.observed.locations.insert(
|
||||
dest_ps.get_id(),
|
||||
dest_ps_id,
|
||||
ObservedStateLocation {
|
||||
conf: Some(dest_final_conf),
|
||||
},
|
||||
);
|
||||
|
||||
tracing::info!("✅ Migration complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
|
||||
// If the attached node has uncertain state, read it from the pageserver before proceeding: this
|
||||
// is important to avoid spurious generation increments.
|
||||
//
|
||||
// We don't need to do this for secondary/detach locations because it's harmless to just PUT their
|
||||
// location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate
|
||||
// the `Timeline` object in the pageserver.
|
||||
|
||||
let Some(attached_node) = self.intent.attached.as_ref() else {
|
||||
// Nothing to do
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
if matches!(
|
||||
self.observed.locations.get(&attached_node.get_id()),
|
||||
Some(ObservedStateLocation { conf: None })
|
||||
) {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
let observed_conf = match attached_node
|
||||
.with_client_retries(
|
||||
|client| async move { client.get_location_config(tenant_shard_id).await },
|
||||
&self.service_config.jwt_token,
|
||||
1,
|
||||
1,
|
||||
Duration::from_secs(5),
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Ok(observed)) => Some(observed),
|
||||
Some(Err(mgmt_api::Error::ApiError(status, _msg)))
|
||||
if status == StatusCode::NOT_FOUND =>
|
||||
{
|
||||
None
|
||||
}
|
||||
Some(Err(e)) => return Err(e.into()),
|
||||
None => return Err(ReconcileError::Cancel),
|
||||
};
|
||||
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
|
||||
match observed_conf {
|
||||
Some(conf) => {
|
||||
// Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
|
||||
// if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
|
||||
self.observed
|
||||
.locations
|
||||
.insert(attached_node.get_id(), ObservedStateLocation { conf });
|
||||
}
|
||||
None => {
|
||||
// Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
|
||||
self.observed.locations.remove(&attached_node.get_id());
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("✅ Migration complete");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -582,14 +426,14 @@ impl Reconciler {
|
||||
/// general case reconciliation where we walk through the intent by pageserver
|
||||
/// and call out to the pageserver to apply the desired state.
|
||||
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
|
||||
// Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
|
||||
self.maybe_refresh_observed().await?;
|
||||
// TODO: if any of self.observed is None, call to remote pageservers
|
||||
// to learn correct state.
|
||||
|
||||
// Special case: live migration
|
||||
self.maybe_live_migrate().await?;
|
||||
|
||||
// If the attached pageserver is not attached, do so now.
|
||||
if let Some(node) = self.intent.attached.as_ref() {
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
// If we are in an attached policy, then generation must have been set (null generations
|
||||
// are only present when a tenant is initially loaded with a secondary policy)
|
||||
debug_assert!(self.generation.is_some());
|
||||
@@ -599,16 +443,11 @@ impl Reconciler {
|
||||
)));
|
||||
};
|
||||
|
||||
let mut wanted_conf = attached_location_conf(
|
||||
generation,
|
||||
&self.shard,
|
||||
&self.config,
|
||||
!self.intent.secondary.is_empty(),
|
||||
);
|
||||
match self.observed.locations.get(&node.get_id()) {
|
||||
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
|
||||
tracing::info!(%node_id, "Observed configuration already correct.")
|
||||
}
|
||||
observed => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
@@ -646,21 +485,13 @@ impl Reconciler {
|
||||
if increment_generation {
|
||||
let generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, node.get_id())
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
self.generation = Some(generation);
|
||||
wanted_conf.generation = generation.into();
|
||||
}
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
|
||||
|
||||
// Because `node` comes from a ref to &self, clone it before calling into a &mut self
|
||||
// function: this could be avoided by refactoring the state mutated by location_config into
|
||||
// a separate type to Self.
|
||||
let node = node.clone();
|
||||
|
||||
// Use lazy=true, because we may run many of Self concurrently, and do not want to
|
||||
// overload the pageserver with logical size calculations.
|
||||
self.location_config(&node, wanted_conf, None, true).await?;
|
||||
tracing::info!(%node_id, "Observed configuration requires update.");
|
||||
self.location_config(node_id, wanted_conf, None).await?;
|
||||
self.compute_notify().await?;
|
||||
}
|
||||
}
|
||||
@@ -669,27 +500,33 @@ impl Reconciler {
|
||||
// Configure secondary locations: if these were previously attached this
|
||||
// implicitly downgrades them from attached to secondary.
|
||||
let mut changes = Vec::new();
|
||||
for node in &self.intent.secondary {
|
||||
for node_id in &self.intent.secondary {
|
||||
let wanted_conf = secondary_location_conf(&self.shard, &self.config);
|
||||
match self.observed.locations.get(&node.get_id()) {
|
||||
match self.observed.locations.get(node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
|
||||
tracing::info!(%node_id, "Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
// reconcile this location.
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
|
||||
changes.push((node.clone(), wanted_conf))
|
||||
tracing::info!(%node_id, "Observed configuration requires update.");
|
||||
changes.push((*node_id, wanted_conf))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Detach any extraneous pageservers that are no longer referenced
|
||||
// by our intent.
|
||||
for node in &self.detach {
|
||||
let all_pageservers = self.intent.all_pageservers();
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if all_pageservers.contains(node_id) {
|
||||
// We are only detaching pageservers that aren't used at all.
|
||||
continue;
|
||||
}
|
||||
|
||||
changes.push((
|
||||
node.clone(),
|
||||
*node_id,
|
||||
LocationConfig {
|
||||
mode: LocationConfigMode::Detached,
|
||||
generation: None,
|
||||
@@ -702,11 +539,11 @@ impl Reconciler {
|
||||
));
|
||||
}
|
||||
|
||||
for (node, conf) in changes {
|
||||
for (node_id, conf) in changes {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(ReconcileError::Cancel);
|
||||
}
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
self.location_config(node_id, conf, None).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -715,21 +552,16 @@ impl Reconciler {
|
||||
pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> {
|
||||
// Whenever a particular Reconciler emits a notification, it is always notifying for the intended
|
||||
// destination.
|
||||
if let Some(node) = &self.intent.attached {
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
let result = self
|
||||
.compute_hook
|
||||
.notify(
|
||||
self.tenant_shard_id,
|
||||
node.get_id(),
|
||||
self.shard.stripe_size,
|
||||
&self.cancel,
|
||||
)
|
||||
.notify(self.tenant_shard_id, node_id, &self.cancel)
|
||||
.await;
|
||||
if let Err(e) = &result {
|
||||
// It is up to the caller whether they want to drop out on this error, but they don't have to:
|
||||
// in general we should avoid letting unavailability of the cloud control plane stop us from
|
||||
// making progress.
|
||||
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
|
||||
tracing::warn!("Failed to notify compute of attached pageserver {node_id}: {e}");
|
||||
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
|
||||
// needs to retry at some point.
|
||||
self.compute_notify_failure = true;
|
||||
@@ -741,26 +573,10 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
/// We tweak the externally-set TenantConfig while configuring
|
||||
/// locations, using our awareness of whether secondary locations
|
||||
/// are in use to automatically enable/disable heatmap uploads.
|
||||
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
|
||||
let mut config = config.clone();
|
||||
if has_secondaries {
|
||||
if config.heatmap_period.is_none() {
|
||||
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
|
||||
}
|
||||
} else {
|
||||
config.heatmap_period = None;
|
||||
}
|
||||
config
|
||||
}
|
||||
|
||||
pub(crate) fn attached_location_conf(
|
||||
generation: Generation,
|
||||
shard: &ShardIdentity,
|
||||
config: &TenantConfig,
|
||||
has_secondaries: bool,
|
||||
) -> LocationConfig {
|
||||
LocationConfig {
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
@@ -769,7 +585,7 @@ pub(crate) fn attached_location_conf(
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: ha_aware_config(config, has_secondaries),
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -784,6 +600,6 @@ pub(crate) fn secondary_location_conf(
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: ha_aware_config(config, true),
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
use pageserver_api::controller_api::UtilizationScore;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
@@ -20,34 +19,15 @@ impl From<ScheduleError> for ApiError {
|
||||
}
|
||||
|
||||
#[derive(Serialize, Eq, PartialEq)]
|
||||
pub enum MaySchedule {
|
||||
Yes(UtilizationScore),
|
||||
No,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||
shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
may_schedule: MaySchedule,
|
||||
may_schedule: bool,
|
||||
}
|
||||
|
||||
impl PartialEq for SchedulerNode {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
let may_schedule_matches = matches!(
|
||||
(&self.may_schedule, &other.may_schedule),
|
||||
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
|
||||
);
|
||||
|
||||
may_schedule_matches && self.shard_count == other.shard_count
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for SchedulerNode {}
|
||||
|
||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||
/// on which to run.
|
||||
///
|
||||
@@ -63,7 +43,7 @@ impl Scheduler {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
for node in nodes {
|
||||
scheduler_nodes.insert(
|
||||
node.get_id(),
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
@@ -88,7 +68,7 @@ impl Scheduler {
|
||||
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
||||
for node in nodes {
|
||||
expect_nodes.insert(
|
||||
node.get_id(),
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
@@ -176,7 +156,7 @@ impl Scheduler {
|
||||
|
||||
pub(crate) fn node_upsert(&mut self, node: &Node) {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
match self.nodes.entry(node.get_id()) {
|
||||
match self.nodes.entry(node.id) {
|
||||
Occupied(mut entry) => {
|
||||
entry.get_mut().may_schedule = node.may_schedule();
|
||||
}
|
||||
@@ -206,15 +186,13 @@ impl Scheduler {
|
||||
return None;
|
||||
}
|
||||
|
||||
// TODO: When the utilization score returned by the pageserver becomes meaningful,
|
||||
// schedule based on that instead of the shard count.
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule != MaySchedule::No)
|
||||
.map(|n| n.may_schedule)
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
@@ -233,7 +211,7 @@ impl Scheduler {
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||
if hard_exclude.contains(k) || !v.may_schedule {
|
||||
None
|
||||
} else {
|
||||
Some((*k, v.shard_count))
|
||||
@@ -252,7 +230,7 @@ impl Scheduler {
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.may_schedule,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
@@ -277,7 +255,7 @@ impl Scheduler {
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
|
||||
use pageserver_api::controller_api::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
@@ -286,18 +264,18 @@ pub(crate) mod test_utils {
|
||||
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
|
||||
(1..n + 1)
|
||||
.map(|i| {
|
||||
(NodeId(i), {
|
||||
let mut node = Node::new(
|
||||
NodeId(i),
|
||||
format!("httphost-{i}"),
|
||||
80 + i as u16,
|
||||
format!("pghost-{i}"),
|
||||
5432 + i as u16,
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
assert!(node.is_available());
|
||||
node
|
||||
})
|
||||
(
|
||||
NodeId(i),
|
||||
Node {
|
||||
id: NodeId(i),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: format!("httphost-{i}"),
|
||||
listen_http_port: 80 + i as u16,
|
||||
listen_pg_addr: format!("pghost-{i}"),
|
||||
listen_pg_port: 5432 + i as u16,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +1,7 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use pageserver_api::controller_api::PlacementPolicy;
|
||||
use pageserver_api::controller_api::NodeAvailability;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
@@ -29,7 +25,7 @@ use crate::{
|
||||
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
|
||||
},
|
||||
scheduler::{ScheduleError, Scheduler},
|
||||
service, Sequence,
|
||||
service, PlacementPolicy, Sequence,
|
||||
};
|
||||
|
||||
/// Serialization helper
|
||||
@@ -374,7 +370,7 @@ impl TenantState {
|
||||
/// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next,
|
||||
/// to get an intent state that complies with placement policy. The overall goal is to do scheduling
|
||||
/// in a way that makes use of any configured locations that already exist in the outside world.
|
||||
pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) {
|
||||
pub(crate) fn intent_from_observed(&mut self) {
|
||||
// Choose an attached location by filtering observed locations, and then sorting to get the highest
|
||||
// generation
|
||||
let mut attached_locs = self
|
||||
@@ -399,7 +395,7 @@ impl TenantState {
|
||||
|
||||
attached_locs.sort_by_key(|i| i.1);
|
||||
if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
|
||||
self.intent.set_attached(scheduler, Some(*node_id));
|
||||
self.intent.attached = Some(*node_id);
|
||||
}
|
||||
|
||||
// All remaining observed locations generate secondary intents. This includes None
|
||||
@@ -410,7 +406,7 @@ impl TenantState {
|
||||
// will take care of promoting one of these secondaries to be attached.
|
||||
self.observed.locations.keys().for_each(|node_id| {
|
||||
if Some(*node_id) != self.intent.attached {
|
||||
self.intent.push_secondary(scheduler, *node_id);
|
||||
self.intent.secondary.push(*node_id);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -457,7 +453,22 @@ impl TenantState {
|
||||
// Add/remove nodes to fulfil policy
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Attached(secondary_count) => {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
let retain_secondaries = if self.intent.attached.is_none()
|
||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||
{
|
||||
@@ -553,25 +564,18 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
fn dirty(&self, nodes: &Arc<HashMap<NodeId, Node>>) -> bool {
|
||||
let mut dirty_nodes = HashSet::new();
|
||||
|
||||
fn dirty(&self) -> bool {
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
// Maybe panic: it is a severe bug if we try to attach while generation is null.
|
||||
let generation = self
|
||||
.generation
|
||||
.expect("Attempted to enter attached state without a generation");
|
||||
|
||||
let wanted_conf = attached_location_conf(
|
||||
generation,
|
||||
&self.shard,
|
||||
&self.config,
|
||||
!self.intent.secondary.is_empty(),
|
||||
);
|
||||
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
dirty_nodes.insert(node_id);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -581,7 +585,7 @@ impl TenantState {
|
||||
match self.observed.locations.get(node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
dirty_nodes.insert(*node_id);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -589,25 +593,24 @@ impl TenantState {
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
|
||||
// We have observed state that isn't part of our intent: need to clean it up.
|
||||
dirty_nodes.insert(*node_id);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
dirty_nodes.retain(|node_id| {
|
||||
nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.is_available())
|
||||
.unwrap_or(false)
|
||||
});
|
||||
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
|
||||
// wake up a reconciler to send it.
|
||||
if self.pending_compute_notification {
|
||||
return true;
|
||||
}
|
||||
|
||||
!dirty_nodes.is_empty()
|
||||
false
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
pageservers: &Arc<HashMap<NodeId, Node>>,
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
@@ -622,20 +625,15 @@ impl TenantState {
|
||||
let node = pageservers
|
||||
.get(node_id)
|
||||
.expect("Nodes may not be removed while referenced");
|
||||
if observed_loc.conf.is_none() && node.is_available() {
|
||||
if observed_loc.conf.is_none()
|
||||
&& !matches!(node.availability, NodeAvailability::Offline)
|
||||
{
|
||||
dirty_observed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let active_nodes_dirty = self.dirty(pageservers);
|
||||
|
||||
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
|
||||
// wake up a reconciler to send it.
|
||||
let do_reconcile =
|
||||
active_nodes_dirty || dirty_observed || self.pending_compute_notification;
|
||||
|
||||
if !do_reconcile {
|
||||
if !self.dirty() && !dirty_observed {
|
||||
tracing::info!("Not dirty, no reconciliation needed.");
|
||||
return None;
|
||||
}
|
||||
@@ -665,21 +663,6 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
// Build list of nodes from which the reconciler should detach
|
||||
let mut detach = Vec::new();
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if self.intent.get_attached() != &Some(*node_id)
|
||||
&& !self.intent.secondary.contains(node_id)
|
||||
{
|
||||
detach.push(
|
||||
pageservers
|
||||
.get(node_id)
|
||||
.expect("Intent references non-existent pageserver")
|
||||
.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
|
||||
// doing our sequence's work.
|
||||
let old_handle = self.reconciler.take();
|
||||
@@ -694,15 +677,14 @@ impl TenantState {
|
||||
self.sequence = self.sequence.next();
|
||||
|
||||
let reconciler_cancel = cancel.child_token();
|
||||
let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
shard: self.shard,
|
||||
generation: self.generation,
|
||||
intent: reconciler_intent,
|
||||
detach,
|
||||
intent: TargetState::from_intent(&self.intent),
|
||||
config: self.config.clone(),
|
||||
observed: self.observed.clone(),
|
||||
pageservers: pageservers.clone(),
|
||||
compute_hook: compute_hook.clone(),
|
||||
service_config: service_config.clone(),
|
||||
_gate_guard: gate_guard,
|
||||
@@ -719,7 +701,6 @@ impl TenantState {
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let result_tx = result_tx.clone();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
@@ -838,10 +819,7 @@ impl TenantState {
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use pageserver_api::{
|
||||
controller_api::NodeAvailability,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::scheduler::test_utils::make_test_nodes;
|
||||
@@ -880,7 +858,7 @@ pub(crate) mod tests {
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
@@ -900,10 +878,7 @@ pub(crate) mod tests {
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 2);
|
||||
|
||||
// Update the scheduler state to indicate the node is offline
|
||||
nodes
|
||||
.get_mut(&attached_node_id)
|
||||
.unwrap()
|
||||
.set_availability(NodeAvailability::Offline);
|
||||
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
|
||||
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
||||
|
||||
// Scheduling the node should promote the still-available secondary node to attached
|
||||
@@ -922,54 +897,4 @@ pub(crate) mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn intent_from_observed() -> anyhow::Result<()> {
|
||||
let nodes = make_test_nodes(3);
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
|
||||
|
||||
tenant_state.observed.locations.insert(
|
||||
NodeId(3),
|
||||
ObservedStateLocation {
|
||||
conf: Some(LocationConfig {
|
||||
mode: LocationConfigMode::AttachedMulti,
|
||||
generation: Some(2),
|
||||
secondary_conf: None,
|
||||
shard_number: tenant_state.shard.number.0,
|
||||
shard_count: tenant_state.shard.count.literal(),
|
||||
shard_stripe_size: tenant_state.shard.stripe_size.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
tenant_state.observed.locations.insert(
|
||||
NodeId(2),
|
||||
ObservedStateLocation {
|
||||
conf: Some(LocationConfig {
|
||||
mode: LocationConfigMode::AttachedStale,
|
||||
generation: Some(1),
|
||||
secondary_conf: None,
|
||||
shard_number: tenant_state.shard.number.0,
|
||||
shard_count: tenant_state.shard.count.literal(),
|
||||
shard_stripe_size: tenant_state.shard.stripe_size.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
}),
|
||||
},
|
||||
);
|
||||
|
||||
tenant_state.intent_from_observed(&mut scheduler);
|
||||
|
||||
// The highest generationed attached location gets used as attached
|
||||
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
|
||||
// Other locations get used as secondary
|
||||
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
|
||||
|
||||
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use pageserver_api::{
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -24,7 +24,7 @@ use utils::{
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
pub struct StorageController {
|
||||
pub struct AttachmentService {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
@@ -34,12 +34,9 @@ pub struct StorageController {
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
@@ -62,7 +59,7 @@ pub struct InspectResponse {
|
||||
pub attachment: Option<(u32, NodeId)>,
|
||||
}
|
||||
|
||||
impl StorageController {
|
||||
impl AttachmentService {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
@@ -139,27 +136,27 @@ impl StorageController {
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store storage controller state
|
||||
/// PIDFile for the postgres instance used to store attachment service state
|
||||
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join("storage_controller_postgres.pid"),
|
||||
.join("attachment_service_postgres.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
@@ -192,7 +189,7 @@ impl StorageController {
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
const DB_NAME: &str = "attachment_service";
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -222,10 +219,10 @@ impl StorageController {
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
// Start a vanilla Postgres process used by the attachment service for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
@@ -248,7 +245,7 @@ impl StorageController {
|
||||
.await?;
|
||||
};
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
println!("Starting attachment service database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
@@ -259,7 +256,7 @@ impl StorageController {
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
"attachment_service_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
@@ -272,8 +269,6 @@ impl StorageController {
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
@@ -281,8 +276,6 @@ impl StorageController {
|
||||
self.path.as_ref(),
|
||||
"--database-url",
|
||||
&database_url,
|
||||
"--max-unavailable-interval",
|
||||
&max_unavailable.to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
@@ -307,7 +300,7 @@ impl StorageController {
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
&self.env.attachment_service_bin(),
|
||||
args,
|
||||
[(
|
||||
"NEON_REPO_DIR".to_string(),
|
||||
@@ -329,10 +322,10 @@ impl StorageController {
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
println!("Stopping storage controller database...");
|
||||
println!("Stopping attachment service database...");
|
||||
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
||||
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_stop_args)
|
||||
@@ -351,10 +344,10 @@ impl StorageController {
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Storage controller database is already stopped");
|
||||
println!("Attachment service data base is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -375,7 +368,7 @@ impl StorageController {
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
@@ -475,7 +468,7 @@ impl StorageController {
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(
|
||||
Method::GET,
|
||||
format!("debug/v1/tenant/{tenant_id}/locate"),
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
@@ -503,15 +496,11 @@ impl StorageController {
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
new_shard_count: u8,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<TenantShardSplitResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||
Some(TenantShardSplitRequest {
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
}),
|
||||
Some(TenantShardSplitRequest { new_shard_count }),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -8,14 +8,14 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
|
||||
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
||||
"mappings" => handle_mappings(sub_args, &mut env),
|
||||
@@ -435,24 +435,19 @@ async fn handle_tenant(
|
||||
let shard_stripe_size: Option<u32> =
|
||||
create_match.get_one::<u32>("shard-stripe-size").cloned();
|
||||
|
||||
let placement_policy = match create_match.get_one::<String>("placement-policy") {
|
||||
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
|
||||
_ => PlacementPolicy::Attached(0),
|
||||
};
|
||||
|
||||
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
||||
|
||||
// We must register the tenant with the storage controller, so
|
||||
// We must register the tenant with the attachment service, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_create(TenantCreateRequest {
|
||||
// Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
|
||||
// storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
||||
// type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
|
||||
// attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
||||
// type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters {
|
||||
@@ -461,7 +456,6 @@ async fn handle_tenant(
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
},
|
||||
placement_policy: Some(placement_policy),
|
||||
config: tenant_conf,
|
||||
})
|
||||
.await?;
|
||||
@@ -476,9 +470,9 @@ async fn handle_tenant(
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
|
||||
// different shards picking different start lsns. Maybe we have to teach storage controller
|
||||
// different shards picking different start lsns. Maybe we have to teach attachment service
|
||||
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
|
||||
storage_controller
|
||||
attachment_service
|
||||
.tenant_timeline_create(
|
||||
tenant_id,
|
||||
TimelineCreateRequest {
|
||||
@@ -523,6 +517,84 @@ async fn handle_tenant(
|
||||
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
|
||||
println!("tenant {tenant_id} successfully configured on the pageserver");
|
||||
}
|
||||
Some(("migrate", matches)) => {
|
||||
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
|
||||
let new_pageserver = get_pageserver(env, matches)?;
|
||||
let new_pageserver_id = new_pageserver.conf.id;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
||||
.await?;
|
||||
|
||||
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
|
||||
}
|
||||
Some(("status", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
|
||||
let mut shard_table = comfy_table::Table::new();
|
||||
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
|
||||
|
||||
let mut tenant_synthetic_size = None;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
||||
|
||||
let size = pageserver
|
||||
.http_client
|
||||
.tenant_details(shard.shard_id)
|
||||
.await?
|
||||
.tenant_info
|
||||
.current_physical_size
|
||||
.unwrap();
|
||||
|
||||
shard_table.add_row([
|
||||
format!("{}", shard.shard_id.shard_slug()),
|
||||
format!("{}", shard.node_id.0),
|
||||
format!("{} MiB", size / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
if shard.shard_id.is_zero() {
|
||||
tenant_synthetic_size =
|
||||
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(synthetic_size) = tenant_synthetic_size else {
|
||||
bail!("Shard 0 not found")
|
||||
};
|
||||
|
||||
let mut tenant_table = comfy_table::Table::new();
|
||||
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
|
||||
tenant_table.add_row([
|
||||
"Synthetic size".to_string(),
|
||||
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
|
||||
]);
|
||||
|
||||
println!("{tenant_table}");
|
||||
println!("{shard_table}");
|
||||
}
|
||||
Some(("shard-split", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let result = attachment_service
|
||||
.tenant_split(tenant_id, shard_count)
|
||||
.await?;
|
||||
println!(
|
||||
"Split tenant {} into shards {}",
|
||||
tenant_id,
|
||||
result
|
||||
.new_shards
|
||||
.iter()
|
||||
.map(|s| format!("{:?}", s))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
}
|
||||
|
||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||
None => bail!("no tenant subcommand provided"),
|
||||
@@ -535,7 +607,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
|
||||
match timeline_match.subcommand() {
|
||||
Some(("list", list_match)) => {
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
|
||||
@@ -555,7 +627,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
||||
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: None,
|
||||
@@ -563,7 +635,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
ancestor_start_lsn: None,
|
||||
pg_version: Some(pg_version),
|
||||
};
|
||||
let timeline_info = storage_controller
|
||||
let timeline_info = attachment_service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
|
||||
@@ -652,7 +724,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.transpose()
|
||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||
let new_timeline_id = TimelineId::generate();
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||
@@ -660,7 +732,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
ancestor_start_lsn: start_lsn,
|
||||
pg_version: None,
|
||||
};
|
||||
let timeline_info = storage_controller
|
||||
let timeline_info = attachment_service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
|
||||
@@ -689,7 +761,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
match sub_name {
|
||||
"list" => {
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
|
||||
@@ -874,21 +946,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
(
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
// full managed by attachment service, therefore not sharded.
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
|
||||
let pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
@@ -937,8 +1009,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
@@ -946,13 +1018,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported malformed host"),
|
||||
.expect("Attachment service reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
endpoint.reconfigure(pageservers, None).await?;
|
||||
endpoint.reconfigure(pageservers).await?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
@@ -1022,8 +1094,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.start(&pageserver_config_overrides(subcommand_args), *register)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1052,7 +1125,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.start(&pageserver_config_overrides(subcommand_args), false)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1065,8 +1138,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
let scheduling = subcommand_args.get_one("scheduling");
|
||||
let availability = subcommand_args.get_one("availability");
|
||||
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
.node_configure(NodeConfigureRequest {
|
||||
node_id: pageserver.conf.id,
|
||||
scheduling: scheduling.cloned(),
|
||||
@@ -1091,11 +1164,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_storage_controller(
|
||||
async fn handle_attachment_service(
|
||||
sub_match: &ArgMatches,
|
||||
env: &local_env::LocalEnv,
|
||||
) -> Result<()> {
|
||||
let svc = StorageController::from_env(env);
|
||||
let svc = AttachmentService::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start().await {
|
||||
@@ -1115,8 +1188,8 @@ async fn handle_storage_controller(
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
|
||||
None => bail!("no storage_controller subcommand provided"),
|
||||
Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
|
||||
None => bail!("no attachment_service subcommand provided"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1201,11 +1274,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
|
||||
broker::start_broker_process(env).await?;
|
||||
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
// Only start the attachment service if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.start().await {
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start().await {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
@@ -1214,7 +1287,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match))
|
||||
.start(&pageserver_config_overrides(sub_match), true)
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
@@ -1277,9 +1350,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
}
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.stop(immediate).await {
|
||||
eprintln!("storage controller stop failed: {e:#}");
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate).await {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1489,13 +1562,24 @@ fn cli() -> Command {
|
||||
.help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||
.arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement policy shards in this tenant"))
|
||||
)
|
||||
.subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
|
||||
.about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
|
||||
.subcommand(Command::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
|
||||
.subcommand(Command::new("migrate")
|
||||
.about("Migrate a tenant from one pageserver to another")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(pageserver_id_arg.clone()))
|
||||
.subcommand(Command::new("status")
|
||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
.subcommand(Command::new("shard-split")
|
||||
.about("Increase the number of shards in the tenant")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("pageserver")
|
||||
@@ -1505,7 +1589,11 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
|
||||
.long("register")
|
||||
.default_value("true").required(false)
|
||||
.value_parser(value_parser!(bool))
|
||||
.value_name("register"))
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
@@ -1523,9 +1611,9 @@ fn cli() -> Command {
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("storage_controller")
|
||||
Command::new("attachment_service")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.about("Manage attachment_service")
|
||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
|
||||
@@ -52,14 +52,13 @@ use compute_api::spec::RemoteExtSpec;
|
||||
use compute_api::spec::Role;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||
@@ -656,7 +655,7 @@ impl Endpoint {
|
||||
// Wait for it to start
|
||||
let mut attempt = 0;
|
||||
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
|
||||
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
|
||||
loop {
|
||||
attempt += 1;
|
||||
match self.get_status().await {
|
||||
@@ -736,11 +735,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> Result<()> {
|
||||
pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
@@ -750,17 +745,17 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If we weren't given explicit pageservers, query the storage controller
|
||||
// If we weren't given explicit pageservers, query the attachment service
|
||||
if pageservers.is_empty() {
|
||||
let storage_controller = StorageController::from_env(&self.env);
|
||||
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
|
||||
pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
@@ -770,14 +765,8 @@ impl Endpoint {
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstr.is_empty());
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.unwrap();
|
||||
let client = reqwest::Client::new();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
//! local installations.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod attachment_service;
|
||||
mod background_process;
|
||||
pub mod broker;
|
||||
pub mod endpoint;
|
||||
@@ -13,4 +14,3 @@ pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage_controller;
|
||||
|
||||
@@ -72,13 +72,13 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
|
||||
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
|
||||
// Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will
|
||||
// be propagated into each pageserver's configuration.
|
||||
#[serde(default)]
|
||||
pub control_plane_api: Option<Url>,
|
||||
|
||||
// Control plane upcall API for storage controller. If set, this will be propagated into the
|
||||
// storage controller's configuration.
|
||||
// Control plane upcall API for attachment service. If set, this will be propagated into the
|
||||
// attachment service's configuration.
|
||||
#[serde(default)]
|
||||
pub control_plane_compute_hook_api: Option<Url>,
|
||||
|
||||
@@ -114,7 +114,7 @@ impl NeonBroker {
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default, deny_unknown_fields)]
|
||||
#[serde(default)]
|
||||
pub struct PageServerConf {
|
||||
// node id
|
||||
pub id: NodeId,
|
||||
@@ -126,9 +126,6 @@ pub struct PageServerConf {
|
||||
// auth type used for the PG and HTTP ports
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
|
||||
pub(crate) virtual_file_io_engine: Option<String>,
|
||||
pub(crate) get_vectored_impl: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -139,8 +136,6 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
virtual_file_io_engine: None,
|
||||
get_vectored_impl: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -232,12 +227,12 @@ impl LocalEnv {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
|
||||
pub fn storage_controller_bin(&self) -> PathBuf {
|
||||
// Irrespective of configuration, storage controller binary is always
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
// Irrespective of configuration, attachment service binary is always
|
||||
// run from the same location as neon_local. This means that for compatibility
|
||||
// tests that run old pageserver/safekeeper, they still run latest storage controller.
|
||||
// tests that run old pageserver/safekeeper, they still run latest attachment service.
|
||||
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
||||
neon_local_bin_dir.join("storage_controller")
|
||||
neon_local_bin_dir.join("attachment_service")
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> PathBuf {
|
||||
|
||||
@@ -17,6 +17,7 @@ use std::time::Duration;
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::controller_api::NodeRegisterRequest;
|
||||
use pageserver_api::models::{
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
@@ -30,6 +31,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
@@ -78,39 +80,18 @@ impl PageServerNode {
|
||||
///
|
||||
/// These all end up on the command line of the `pageserver` binary.
|
||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
||||
let id = format!("id={}", self.conf.id);
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||
let pg_distrib_dir_param = format!(
|
||||
"pg_distrib_dir='{}'",
|
||||
self.env.pg_distrib_dir_raw().display()
|
||||
);
|
||||
|
||||
let PageServerConf {
|
||||
id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
virtual_file_io_engine,
|
||||
get_vectored_impl,
|
||||
} = &self.conf;
|
||||
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
|
||||
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
|
||||
|
||||
let id = format!("id={}", id);
|
||||
|
||||
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
|
||||
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
|
||||
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
||||
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
|
||||
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
|
||||
format!("get_vectored_impl='{get_vectored_impl}'")
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
|
||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
|
||||
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
@@ -122,8 +103,6 @@ impl PageServerNode {
|
||||
listen_http_addr_param,
|
||||
listen_pg_addr_param,
|
||||
broker_endpoint_param,
|
||||
virtual_file_io_engine,
|
||||
get_vectored_impl,
|
||||
];
|
||||
|
||||
if let Some(control_plane_api) = &self.env.control_plane_api {
|
||||
@@ -132,9 +111,9 @@ impl PageServerNode {
|
||||
control_plane_api.as_str()
|
||||
));
|
||||
|
||||
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
||||
// Attachment service uses the same auth as pageserver: if JWT is enabled
|
||||
// for us, we will also need it to talk to them.
|
||||
if matches!(http_auth_type, AuthType::NeonJWT) {
|
||||
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
||||
let jwt_token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||
@@ -152,7 +131,8 @@ impl PageServerNode {
|
||||
));
|
||||
}
|
||||
|
||||
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
|
||||
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
|
||||
{
|
||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||
// are one level below that, so refer to keys with ../
|
||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||
@@ -183,8 +163,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false).await
|
||||
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false, register).await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
@@ -222,28 +202,6 @@ impl PageServerNode {
|
||||
String::from_utf8_lossy(&init_output.stderr),
|
||||
);
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
// the storage controller
|
||||
let metadata_path = datadir.join("metadata.json");
|
||||
|
||||
let (_http_host, http_port) =
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
std::fs::write(
|
||||
metadata_path,
|
||||
serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": self.pg_connection_config.port(),
|
||||
"http_host": "localhost",
|
||||
"http_port": http_port,
|
||||
}))
|
||||
.unwrap(),
|
||||
)
|
||||
.expect("Failed to write metadata file");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -251,7 +209,27 @@ impl PageServerNode {
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
|
||||
// successfully call /re-attach and finish starting up.
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -451,8 +429,6 @@ impl PageServerNode {
|
||||
generation,
|
||||
config,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
// Placement policy is not meaningful for creations not done via storage controller
|
||||
placement_policy: None,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -561,11 +537,10 @@ impl PageServerNode {
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
lazy: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.location_config(tenant_shard_id, config, flush_ms, lazy)
|
||||
.location_config(tenant_shard_id, config, flush_ms)
|
||||
.await?)
|
||||
}
|
||||
|
||||
@@ -576,6 +551,13 @@ impl PageServerNode {
|
||||
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_secondary_download(*tenant_id)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -623,7 +605,7 @@ impl PageServerNode {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let client = std::pin::pin!(client);
|
||||
tokio::pin!(client);
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
|
||||
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
|
||||
Should only be used e.g. for status check.
|
||||
Currently also used for connection from any pageserver to any safekeeper.
|
||||
|
||||
"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.
|
||||
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
|
||||
|
||||
"admin": Provides access to the control plane and admin APIs of the storage controller.
|
||||
"admin": Provides access to the control plane and admin APIs of the attachment service.
|
||||
|
||||
### CLI
|
||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||
|
||||
@@ -1,408 +0,0 @@
|
||||
# Sharding Phase 1: Static Key-space Sharding
|
||||
|
||||
## Summary
|
||||
|
||||
To enable databases with sizes approaching the capacity of a pageserver's disk,
|
||||
it is necessary to break up the storage for the database, or _shard_ it.
|
||||
|
||||
Sharding in general is a complex area. This RFC aims to define an initial
|
||||
capability that will permit creating large-capacity databases using a static configuration
|
||||
defined at time of Tenant creation.
|
||||
|
||||
## Motivation
|
||||
|
||||
Currently, all data for a Tenant, including all its timelines, is stored on a single
|
||||
pageserver. The local storage required may be several times larger than the actual
|
||||
database size, due to LSM write inflation.
|
||||
|
||||
If a database is larger than what one pageserver can hold, then it becomes impossible
|
||||
for the pageserver to hold it in local storage, as it must do to provide service to
|
||||
clients.
|
||||
|
||||
### Prior art
|
||||
|
||||
In Neon:
|
||||
|
||||
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
|
||||
- Layer File SPreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
|
||||
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
|
||||
|
||||
Prior art in other distributed systems is too broad to capture here: pretty much
|
||||
any scale out storage system does something like this.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Enable creating a large (for example, 16TiB) database without requiring dedicated
|
||||
pageserver nodes.
|
||||
- Share read/write bandwidth costs for large databases across pageservers, as well
|
||||
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
|
||||
that disrupt service to other tenants.
|
||||
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
|
||||
does not write out a single contiguous ranges of page numbers.
|
||||
|
||||
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
|
||||
that a user might create on a current-gen enterprise SSD should also work well on
|
||||
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
|
||||
pageserver backend is not the limiting factor in the database size_.
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Independently distributing timelines within the same tenant. If a tenant has many
|
||||
timelines, then sharding may be a less efficient mechanism for distributing load than
|
||||
sharing out timelines between pageservers.
|
||||
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
|
||||
based on the idea that separate mechanisms will make sense for each dimension.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, control plane, postgres/smgr
|
||||
|
||||
## Terminology
|
||||
|
||||
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
|
||||
the page number is the key in that store. `Key` is a literal data type in existing code.
|
||||
|
||||
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
|
||||
of keys and LSNs as a two dimensional space.
|
||||
|
||||
## Implementation
|
||||
|
||||
### Key sharding vs. LSN sharding
|
||||
|
||||
When we think of sharding across the two dimensional key/lsn space, this is an
|
||||
opportunity to think about how the two dimensions differ:
|
||||
|
||||
- Sharding the key space distributes the _write_ workload of ingesting data
|
||||
and compacting. This work must be carefully managed so that exactly one
|
||||
node owns a given key.
|
||||
- Sharding the LSN space distributes the _historical read_ workload. This work
|
||||
can be done by anyone without any special coordination, as long as they can
|
||||
see the remote index and layers.
|
||||
|
||||
The key sharding is the harder part, and also the more urgent one, to support larger
|
||||
capacity databases. Because distributing historical LSN read work is a relatively
|
||||
simpler problem that most users don't have, we defer it to future work. It is anticipated
|
||||
that some quite simple P2P offload model will enable distributing work for historical
|
||||
reads: a node which is low on space can call out to peer to ask it to download and
|
||||
serve reads from a historical layer.
|
||||
|
||||
### Key mapping scheme
|
||||
|
||||
Having decided to focus on key sharding, we must next decide how we will map
|
||||
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
|
||||
between data locality and avoiding entire large relations mapping to the same shard.
|
||||
|
||||
We will define two spaces:
|
||||
|
||||
- Key space: unsigned integer
|
||||
- Shard space: integer from 0 to N-1, where we have N shards.
|
||||
|
||||
### Key -> Shard mapping
|
||||
|
||||
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
|
||||
|
||||
```
|
||||
pub struct Key {
|
||||
pub field1: u8,
|
||||
pub field2: u32,
|
||||
pub field3: u32,
|
||||
pub field4: u32,
|
||||
pub field5: u8,
|
||||
pub field6: u32,
|
||||
}
|
||||
|
||||
|
||||
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
|
||||
shards. For distribution purposes, we only care about user data keys_
|
||||
|
||||
The properties we want from our Key->Shard mapping are:
|
||||
|
||||
- Locality in `blknum`, such that adjacent `blknum` will usually map to
|
||||
the same stripe and consequently land on the same shard, even though the overall
|
||||
collection of blocks in a relation will be spread over many stripes and therefore
|
||||
many shards.
|
||||
- Avoid the same blknum on different relations landing on the same stripe, so that
|
||||
with many small relations we do not end up aliasing data to the same stripe/shard.
|
||||
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
|
||||
if there are patterns in the value of `relnode`, these do not manifest as patterns
|
||||
in data placement.
|
||||
|
||||
To accomplish this, the blknum is used to select a stripe, and stripes are
|
||||
assigned to shards in a pseudorandom order via a hash. The motivation for
|
||||
pseudo-random distribution (rather than sequential mapping of stripe to shard)
|
||||
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
|
||||
all relations' stripes to touch pageservers in the same order.
|
||||
|
||||
To map a `Key` to a shard:
|
||||
|
||||
- Hash the `Key` field 4 (relNode).
|
||||
- Divide field 6 (`blknum`) field by the stripe size in pages, and combine the
|
||||
hash of this with the hash from the previous step.
|
||||
- The total hash modulo the shard count gives the shard holding this key.
|
||||
|
||||
Why don't we use the other fields in the Key?
|
||||
|
||||
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
|
||||
in the same relation, and we would like to keep the data in a relation together.
|
||||
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
|
||||
database's blocks differ only by spcNode and dbNode from the original. To enable running
|
||||
this type of creation without cross-pageserver communication, we must ensure that these
|
||||
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
|
||||
|
||||
### Data placement examples
|
||||
|
||||
For example, consider the extreme large databases cases of postgres data layout in a system with 8 shards
|
||||
and a stripe size of 32k pages:
|
||||
|
||||
- A single large relation: `blknum` division will break the data up into 4096
|
||||
stripes, which will be scattered across the shards.
|
||||
- 4096 relations of of 32k pages each: each relation will map to exactly one stripe,
|
||||
and that stripe will be placed according to the hash of the key fields 4. The
|
||||
data placement will be statistically uniform across shards.
|
||||
|
||||
Data placement will be more uneven on smaller databases:
|
||||
|
||||
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
|
||||
that both relations land on the same shard and no data lands on the other shard.
|
||||
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
|
||||
the data of the other four shards.
|
||||
|
||||
These uneven cases for small amounts of data do not matter, as long as the stripe size
|
||||
is an order of magnitude smaller than the amount of data we are comfortable holding
|
||||
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
|
||||
a tenant has some shards with 256MB size and some shards with 512MB size, even though
|
||||
the standard deviation of shard size within the tenant is very high. Our key mapping
|
||||
scheme provides a statistical guarantee that as the tenant's overall data size increases,
|
||||
uniformity of placement will improve.
|
||||
|
||||
### Important Types
|
||||
|
||||
#### `ShardIdentity`
|
||||
|
||||
Provides the information needed to know whether a particular key belongs
|
||||
to a particular shard:
|
||||
|
||||
- Layout version
|
||||
- Stripe size
|
||||
- Shard count
|
||||
- Shard index
|
||||
|
||||
This structure's size is constant. Note that if we had used a differnet key
|
||||
mapping scheme such as consistent hashing with explicit hash ranges assigned
|
||||
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
|
||||
key mapping scheme used here enables a small fixed size ShardIdentity.
|
||||
|
||||
### Pageserver changes
|
||||
|
||||
#### Structural
|
||||
|
||||
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
|
||||
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
|
||||
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
|
||||
covers the whole keyspace.
|
||||
|
||||
When the pageserver writes layers and index_part.json to remote storage, it must
|
||||
include the shard index & count in the name, to avoid collisions (the count is
|
||||
necessary for future-proofing: the count will vary in time). These keys
|
||||
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
|
||||
exactly the same for TenantShards as it does for Tenants today: each shard will have
|
||||
its own generation number.
|
||||
|
||||
#### Storage Format: Keys
|
||||
|
||||
For tenants with >1 shard, layer files implicitly become sparse: within the key
|
||||
range described in the layer name, the layer file for a shard will only hold the
|
||||
content relevant to stripes assigned to the shard.
|
||||
|
||||
For this reason, the LayerFileName within a tenant is no longer unique: different shards
|
||||
may use the same LayerFileName to refer to different data. We may solve this simply
|
||||
by including the shard number in the keys used for layers.
|
||||
|
||||
The shard number will be included as a prefix (as part of tenant ID), like this:
|
||||
|
||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
|
||||
|
||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
|
||||
|
||||
Reasons for this particular format:
|
||||
|
||||
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
|
||||
we construct a layer file name), and enables efficient listing of index_parts within
|
||||
a particular shard-timeline prefix.
|
||||
- Including the shard _count_ as well as shard number means that in future when we implement
|
||||
shard splitting, it will be possible for a parent shard and one of its children to write
|
||||
the same layer file without a name collision. For example, a parent shard 0_1 might split
|
||||
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
|
||||
that is distinct from what shard 0_1 would have written at the same place.
|
||||
|
||||
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
|
||||
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
|
||||
for example a single-shard tenant's prefix will be `0001`.
|
||||
|
||||
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
|
||||
and use this as a cue to construct paths with no prefix at all.
|
||||
|
||||
#### Storage Format: Indices
|
||||
|
||||
In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
|
||||
when we implement shard splitting in future, it will be useful to enable shards to reference layers
|
||||
written by other shards (specifically the parent shard during a split), so that shards don't
|
||||
have to exhaustively copy all data into their own shard-prefixed keys.
|
||||
|
||||
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
|
||||
tuple on each layer, such that it can construct paths for layers written by other shards. This
|
||||
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
|
||||
will be addressed in phase 2.
|
||||
|
||||
For backward compatibility, any index entry without shard information will be assumed to be
|
||||
in the legacy shardidentity.
|
||||
|
||||
#### WAL Ingest
|
||||
|
||||
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
|
||||
it down to the pages relevant to their shard:
|
||||
|
||||
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
|
||||
- For metadata describing relations etc, all shards retain these writes.
|
||||
|
||||
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
|
||||
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
|
||||
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
|
||||
expensive: if the safekeeper can be made shard-aware then it could be taught to use
|
||||
the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
|
||||
|
||||
#### Compaction/GC
|
||||
|
||||
No changes needed.
|
||||
|
||||
The pageserver doesn't have to do anything special during compaction
|
||||
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
|
||||
This will result in sparse layer files, containing keys only in the stripes that this
|
||||
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
|
||||
the key range, these should be updated to ignore gaps that are due to sharding, to
|
||||
avoid spuriously splitting up layers ito stripe-sized pieces.
|
||||
|
||||
### Compute Endpoints
|
||||
|
||||
Compute endpoints will need to:
|
||||
|
||||
- Accept a vector of connection strings as part of their configuration from the control plane
|
||||
- Route pageserver requests according to mapping the hash of key to the correct
|
||||
entry in the vector of connection strings.
|
||||
|
||||
Doing this in compute rather than routing requests via a single pageserver is
|
||||
necessary to enable sharding tenants without adding latency from extra hops.
|
||||
|
||||
### Control Plane
|
||||
|
||||
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
|
||||
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
|
||||
tenants.
|
||||
|
||||
Tenant lifecycle operations like deletion will require fanning-out to all the shards
|
||||
in the tenant. The same goes for timeline creation and deletion: a timeline should
|
||||
not be considered created until it has been created in all shards.
|
||||
|
||||
#### Selectively enabling sharding for large tenants
|
||||
|
||||
Initially, we will explicitly enable sharding for large tenants only.
|
||||
|
||||
In future, this hint mechanism will become optional when we implement automatic
|
||||
re-sharding of tenants.
|
||||
|
||||
## Future Phases
|
||||
|
||||
This section exists to indicate what will likely come next after this phase.
|
||||
|
||||
Phases 2a and 2b are amenable to execution in parallel.
|
||||
|
||||
### Phase 2a: WAL fan-out
|
||||
|
||||
**Problem**: when all shards consume the whole WAL, the network bandwidth used
|
||||
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
|
||||
of the shard count.
|
||||
|
||||
Network bandwidth is not our most pressing bottleneck, but it is likely to become
|
||||
a problem if we set a modest shard count (~8) on a significant number of tenants,
|
||||
especially as those larger tenants which we shard are also likely to have higher
|
||||
write bandwidth than average.
|
||||
|
||||
### Phase 2b: Shard Splitting
|
||||
|
||||
**Problem**: the number of shards in a tenant is defined at creation time and cannot
|
||||
be changed. This causes excessive sharding for most small tenants, and an upper
|
||||
bound on scale for very large tenants.
|
||||
|
||||
To address this, a _splitting_ feature will later be added. One shard can split its
|
||||
data into a number of children by doing a special compaction operation to generate
|
||||
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
|
||||
each child. This will then require external coordination (by the control plane) to
|
||||
safely attach these new child shards and then move them around to distribute work.
|
||||
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
|
||||
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
|
||||
the risk/complexity of implementing such a rarely-encountered scenario.
|
||||
|
||||
### Phase N (future): distributed historical reads
|
||||
|
||||
**Problem**: while sharding based on key is good for handling changes in overall
|
||||
database size, it is less suitable for spiky/unpredictable changes in the read
|
||||
workload to historical layers. Sudden increases in historical reads could result
|
||||
in sudden increases in local disk capacity required for a TenantShard.
|
||||
|
||||
Example: the extreme case of this would be to run a tenant for a year, then create branches
|
||||
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
|
||||
the on-disk capacity footprint of a TenantShard, since it would be serving reads
|
||||
from all those disparate historical layers.
|
||||
|
||||
If we can respond fast enough, then key-sharding a tenant more finely can help with
|
||||
this, but splitting may be a relatively expensive operation and the increased historical
|
||||
read load may be transient.
|
||||
|
||||
A separate mechanism for handling heavy historical reads could be something like
|
||||
a gossip mechanism for pageservers to communicate
|
||||
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
|
||||
ask another to go read the necessary layers from remote storage to serve the read. This
|
||||
requires relativly little coordination because it is read-only: any node can service any
|
||||
read. All reads to a particular shard would still flow through one node, but the
|
||||
disk capactity & I/O impact of servicing the read would be distributed.
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
|
||||
|
||||
When a database is growing under a write workload, writes may predominantly hit the
|
||||
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
|
||||
is intensively re-writing a particular relation, if that relation lived in a particular
|
||||
shard then it would not achieve our goal of distributing the write work across shards.
|
||||
|
||||
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
|
||||
|
||||
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
|
||||
database would still cause a load hotspot on the pageserver routing its read requests.
|
||||
2. The additional hop through the "proxy" pageserver would add latency and overall
|
||||
resource cost (CPU, network bandwidth)
|
||||
|
||||
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
|
||||
|
||||
In this model, there would be no explicit sharding of work, but the pageserver to which
|
||||
a tenant is attached would not hold all layers on its disk: instead, it would call out
|
||||
to peers to have them store some layers, and call out to those peers to request reads
|
||||
in those layers.
|
||||
|
||||
This mechanism will work well for distributing work in the LSN dimension, but in the key
|
||||
space dimension it has the major limitation of requiring one node to handle all
|
||||
incoming writes, and compactions. Even if the write workload for a large database
|
||||
fits in one pageserver, it will still be a hotspot and such tenants may still
|
||||
de-facto require their own pageserver.
|
||||
@@ -1,479 +0,0 @@
|
||||
# Shard splitting
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes a new pageserver API for splitting an existing tenant shard into
|
||||
multiple shards, and describes how to use this API to safely increase the total
|
||||
shard count of a tenant.
|
||||
|
||||
## Motivation
|
||||
|
||||
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
|
||||
tenants beyond the capacity of a single pageserver by breaking up the key space
|
||||
into stripes, and distributing these stripes across many pageservers. However,
|
||||
the shard count was defined once at tenant creation time and not varied thereafter.
|
||||
|
||||
In practice, the expected size of a database is rarely known at creation time, and
|
||||
it is inefficient to enable sharding for very small tenants: we need to be
|
||||
able to create a tenant with a small number of shards (such as 1), and later expand
|
||||
when it becomes clear that the tenant has grown in size to a point where sharding
|
||||
is beneficial.
|
||||
|
||||
### Prior art
|
||||
|
||||
Many distributed systems have the problem of choosing how many shards to create for
|
||||
tenants that do not specify an expected size up-front. There are a couple of general
|
||||
approaches:
|
||||
|
||||
- Write to a key space in order, and start a new shard when the highest key advances
|
||||
past some point. This doesn't work well for Neon, because we write to our key space
|
||||
in many different contiguous ranges (per relation), rather than in one contiguous
|
||||
range. To adapt to this kind of model, we would need a sharding scheme where each
|
||||
relation had its own range of shards, which would be inefficient for the common
|
||||
case of databases with many small relations.
|
||||
- Monitor the system, and automatically re-shard at some size threshold. For
|
||||
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
|
||||
component monitors the size of each RADOS Pool, and adjusts the number of Placement
|
||||
Groups (Ceph's shard equivalent).
|
||||
|
||||
## Requirements
|
||||
|
||||
- A configurable capacity limit per-shard is enforced.
|
||||
- Changes in shard count do not interrupt service beyond requiring postgres
|
||||
to reconnect (i.e. milliseconds).
|
||||
- Human being does not have to choose shard count
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Shard splitting is always a tenant-global operation: we will not enable splitting
|
||||
one shard while leaving others intact.
|
||||
- The inverse operation (shard merging) is not described in this RFC. This is a lower
|
||||
priority than splitting, because databases grow more often than they shrink, and
|
||||
a database with many shards will still work properly if the stored data shrinks, just
|
||||
with slightly more overhead (e.g. redundant WAL replication)
|
||||
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
|
||||
a tenant based on load will make sense for some medium-capacity, high-load workloads,
|
||||
but is more complex to reason about and likely is not desirable until we have
|
||||
shard merging to reduce the shard count again if the database becomes less busy.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, storage controller
|
||||
|
||||
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
|
||||
|
||||
## Terminology
|
||||
|
||||
**Parent** shards are the shards that exist before a split. **Child** shards are
|
||||
the new shards created during a split.
|
||||
|
||||
**Shard** is synonymous with _tenant shard_.
|
||||
|
||||
**Shard Index** is the 2-tuple of shard number and shard count, written in
|
||||
paths as {:02x}{:02x}, e.g. `0001`.
|
||||
|
||||
## Background
|
||||
|
||||
In the implementation section, a couple of existing aspects of sharding are important
|
||||
to remember:
|
||||
|
||||
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
|
||||
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
|
||||
storage paths, and remote index metadata.
|
||||
- Remote layer file paths contain the shard index of the shard that created them, and
|
||||
remote indices contain the same index to enable building the layer file path. A shard's
|
||||
index may reference layers that were created by another shard.
|
||||
- Local tenant shard directories include the shard index. All layers downloaded by
|
||||
a tenant shard are stored in this shard-prefixed path, even if those layers were
|
||||
initially created by another shard: tenant shards do not read and write one anothers'
|
||||
paths.
|
||||
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
|
||||
This is for historical reasons and will be cleaned up in future, but the existing
|
||||
name is used here to help comprehension when reading code.
|
||||
|
||||
## Implementation
|
||||
|
||||
Note: this section focuses on the correctness of the core split process. This will
|
||||
be fairly inefficient in a naive implementation, and several important optimizations
|
||||
are described in a later section.
|
||||
|
||||
There are broadly two parts to the implementation:
|
||||
|
||||
1. The pageserver split API, which splits one shard on one pageserver
|
||||
2. The overall tenant split proccess which is coordinated by the storage controller,
|
||||
and calls into the pageserver split API as needed.
|
||||
|
||||
### Pageserver Split API
|
||||
|
||||
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
|
||||
that takes the new total shard count in the body.
|
||||
|
||||
The pageserver split API operates on one tenant shard, on one pageserver. External
|
||||
coordination is required to use it safely, this is described in the later
|
||||
'Split procedure' section.
|
||||
|
||||
#### Preparation
|
||||
|
||||
First identify the shard indices for the new child shards. These are deterministic,
|
||||
calculated from the parent shard's index, and the number of children being created (this
|
||||
is an input to the API, and validated to be a power of two). In a trivial example, splitting
|
||||
0001 in two always results in 0002 and 0102.
|
||||
|
||||
Child shard indices are chosen such that the childrens' parts of the keyspace will
|
||||
be subsets of the parent's parts of the keyspace.
|
||||
|
||||
#### Step 1: write new remote indices
|
||||
|
||||
In remote storage, splitting is very simple: we may just write new index_part.json
|
||||
objects for each child shard, containing exactly the same layers as the parent shard.
|
||||
|
||||
The children will have more data than they need, but this avoids any exhausive
|
||||
re-writing or copying of layer files.
|
||||
|
||||
The index key path includes a generation number: the parent shard's current
|
||||
attached generation number will also be used for the child shards' indices. This
|
||||
makes the operation safely retryable: if everything crashes and restarts, we may
|
||||
call the split API again on the parent shard, and the result will be some new remote
|
||||
indices for the child shards, under a higher generation number.
|
||||
|
||||
#### Step 2: start new `Tenant` objects
|
||||
|
||||
A new `Tenant` object may be instantiated for each child shard, while the parent
|
||||
shard still exists. When calling the tenant_spawn function for this object,
|
||||
the remote index from step 1 will be read, and the child shard will start
|
||||
to ingest WAL to catch up from whatever was in the remote storage at step 1.
|
||||
|
||||
We now wait for child shards' WAL ingestion to catch up with the parent shard,
|
||||
so that we can safely tear down the parent shard without risking an availability
|
||||
gap to clients reading recent LSNs.
|
||||
|
||||
#### Step 3: tear down parent `Tenant` object
|
||||
|
||||
Once child shards are running and have caught up with WAL ingest, we no longer
|
||||
need the parent shard. Note that clients may still be using it -- when we
|
||||
shut it down, any page_service handlers will also shut down, causing clients
|
||||
to disconnect. When the client reconnects, it will re-lookup the tenant,
|
||||
and hit the child shard instead of the parent (shard lookup from page_service
|
||||
should bias toward higher ShardCount shards).
|
||||
|
||||
Note that at this stage the page service client has not yet been notified of
|
||||
any split. In the trivial single split example:
|
||||
|
||||
- Shard 0001 is gone: Tenant object torn down
|
||||
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
|
||||
- Clients will continue to connect to that server thinking that shard 0001 is there,
|
||||
and all requests will work, because any key that was in shard 0001 is definitely
|
||||
available in either shard 0002 or shard 0102.
|
||||
- Eventually, the storage controller (not the pageserver) will decide to migrate
|
||||
some child shards away: at that point it will do a live migration, ensuring
|
||||
that the client has an updated configuration before it detaches anything
|
||||
from the original server.
|
||||
|
||||
#### Complete
|
||||
|
||||
When we send a 200 response to the split request, we are promising the caller:
|
||||
|
||||
- That the child shards are persistent in remote storage
|
||||
- That the parent shard has been shut down
|
||||
|
||||
This enables the caller to proceed with the overall shard split operation, which
|
||||
may involve other shards on other pageservers.
|
||||
|
||||
### Storage Controller Split procedure
|
||||
|
||||
Splitting a tenant requires calling the pageserver split API, and tracking
|
||||
enough state to ensure recovery + completion in the event of any component (pageserver
|
||||
or storage controller) crashing (or request timing out) during the split.
|
||||
|
||||
1. call the split API on all existing shards. Ensure that the resulting
|
||||
child shards are pinned to their pageservers until _all_ the split calls are done.
|
||||
This pinning may be implemented as a "split bit" on the tenant shards, that
|
||||
blocks any migrations, and also acts as a sign that if we restart, we must go
|
||||
through some recovery steps to resume the split.
|
||||
2. Once all the split calls are done, we may unpin the child shards (clear
|
||||
the split bit). The split is now complete: subsequent steps are just migrations,
|
||||
not strictly part of the split.
|
||||
3. Try to schedule new pageserver locations for the child shards, using
|
||||
a soft anti-affinity constraint to place shards from the same tenant onto different
|
||||
pageservers.
|
||||
|
||||
Updating computes about the new shard count is not necessary until we migrate
|
||||
any of the child shards away from the parent's location.
|
||||
|
||||
### Recovering from failures
|
||||
|
||||
#### Rolling back an incomplete split
|
||||
|
||||
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
|
||||
and detaching child shards. This will lose any WAL ingested into the children after the parents
|
||||
were detached earlier, but the parents will catch up.
|
||||
|
||||
No special pageserver API is needed for this. From the storage controllers point of view, the
|
||||
procedure is:
|
||||
|
||||
1. For all parent shards in the tenant, ensure they are attached
|
||||
2. For all child shards, ensure they are not attached
|
||||
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
|
||||
|
||||
Any remote storage content for child shards is left behind. This is similar to other cases where
|
||||
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
|
||||
index that references it). Future online scrub/cleanup functionality can remove these objects, or
|
||||
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
|
||||
which would include any child shards that were rolled back.
|
||||
|
||||
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
|
||||
this, we will **block timeline creation during splitting**, so that we can safely roll back until
|
||||
the split is complete, without risking losing timelines.
|
||||
|
||||
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
|
||||
reason, and will not be accessible via an API:
|
||||
|
||||
- A pageserver fails to complete its split API request after too many retries
|
||||
- A pageserver returns a fatal unexpected error such as 400 or 500
|
||||
- The storage controller database returns a non-retryable error
|
||||
- Some internal invariant is violated in the storage controller split code
|
||||
|
||||
#### Rolling back a complete split
|
||||
|
||||
A complete shard split may be rolled back similarly to an incomplete split, with the following
|
||||
modifications:
|
||||
|
||||
- The parent shards will no longer exist in the storage controller database, so these must
|
||||
be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This
|
||||
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
|
||||
shards in the storage controller database.
|
||||
- Any timelines that were created after the split complete will disappear when rolling back
|
||||
to the tenant shards. For this reason, rolling back after a complete split should only
|
||||
be done due to serious issues where loss of recently created timelines is acceptable, or
|
||||
in cases where we have confirmed that no timelines were created in the intervening period.
|
||||
- Parent shards' layers must not have been deleted: this property will come "for free" when
|
||||
we first roll out sharding, by simply not implementing deletion of parent layers after
|
||||
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
|
||||
Optimizations section), it should apply a TTL to layers such that we have a
|
||||
defined walltime window in which rollback will be possible.
|
||||
|
||||
The storage controller will expose an API for rolling back a complete split, for use
|
||||
in the field if we encounter some critical bug with a post-split tenant.
|
||||
|
||||
#### Retrying API calls during Pageserver Restart
|
||||
|
||||
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
|
||||
child shards from an ongoing split. This does not intrinsically break anything, and the
|
||||
pageserver may include all these shards in its `/re-attach` request to the storage controller.
|
||||
|
||||
In order to support such restarts, it is important that the storage controller stores
|
||||
persistent records of each child shard before it calls into a pageserver, as these child shards
|
||||
may require generation increments via a `/re-attach` request.
|
||||
|
||||
The pageserver restart will also result in a failed API call from the storage controller's point
|
||||
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
|
||||
complete, and all shards must remain pinned to their current pageserver locations until the
|
||||
split is done.
|
||||
|
||||
The pageserver API calls during splitting will retry on transient errors, so that
|
||||
short availability gaps do not result in a failure of the overall operation. The
|
||||
split in progress will be automatically rolled back if the threshold for API
|
||||
retries is reached (e.g. if a pageserver stays offline for longer than a typical
|
||||
restart).
|
||||
|
||||
#### Rollback on Storage Controller Restart
|
||||
|
||||
On startup, the storage controller will inspect the split bit for tenant shards that
|
||||
it loads from the database. If any splits are in progress:
|
||||
|
||||
- Database content will be reverted to the parent shards
|
||||
- Child shards will be dropped from memory
|
||||
- The parent and child shards will be included in the general startup reconciliation that
|
||||
the storage controller does: any child shards will be detached from pageservers because
|
||||
they don't exist in the storage controller's expected set of shards, and parent shards
|
||||
will be attached if they aren't already.
|
||||
|
||||
#### Storage controller API request failures/retries
|
||||
|
||||
The split request handler will implement idempotency: if the [`Tenant`] requested to split
|
||||
doesn't exist, we will check for the would-be child shards, and if they already exist,
|
||||
we consider the request complete.
|
||||
|
||||
If a request is retried while the original request is still underway, then the split
|
||||
request handler will notice an InProgress marker in TenantManager, and return 503
|
||||
to encourage the client to backoff/retry. This is the same as the general pageserver
|
||||
API handling for calls that try to act on an InProgress shard.
|
||||
|
||||
#### Compute start/restart during a split
|
||||
|
||||
If a compute starts up during split, it will be configured with the old sharding
|
||||
configuration. This will work for reads irrespective of the progress of the split
|
||||
as long as no child hards have been migrated away from their original location, and
|
||||
this is guaranteed in the split procedure (see earlier section).
|
||||
|
||||
#### Pageserver fails permanently during a split
|
||||
|
||||
If a pageserver permanently fails (i.e. the storage controller availability state for it
|
||||
goes to Offline) while a split is in progress, the splitting operation will roll back, and
|
||||
during the roll back it will skip any API calls to the offline pageserver. If the offline
|
||||
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
|
||||
|
||||
### Handling secondary locations
|
||||
|
||||
For correctness, it is not necessary to split secondary locations. We can simply detach
|
||||
the secondary locations for parent shards, and then attach new secondary locations
|
||||
for child shards.
|
||||
|
||||
Clearly this is not optimal, as it will result in re-downloads of layer files that
|
||||
were already present on disk. See "Splitting secondary locations"
|
||||
|
||||
### Conditions to trigger a split
|
||||
|
||||
The pageserver will expose a new API for reporting on shards that are candidates
|
||||
for split: this will return a top-N report of the largest tenant shards by
|
||||
physical size (remote size). This should exclude any tenants that are already
|
||||
at the maximum configured shard count.
|
||||
|
||||
The API would look something like:
|
||||
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
|
||||
|
||||
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
|
||||
|
||||
A split operation will be started when the tenant exceeds some threshold. This threshold
|
||||
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
|
||||
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
|
||||
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
|
||||
tenant size distribution may be useful here: if we can make a statement like "usually, if
|
||||
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
|
||||
make our policy to split a tenant at 20GiB.
|
||||
|
||||
The finest split we can do is by factors of two, but we can do higher-cardinality splits
|
||||
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
|
||||
as it grows. An example of a very simple heuristic for early deployment of the splitting
|
||||
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
|
||||
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
|
||||
split a tenant, it will not need re-splitting soon after.
|
||||
|
||||
## Optimizations
|
||||
|
||||
### Flush parent shard to remote storage during split
|
||||
|
||||
Any data that is in WAL but not remote storage at time of split will need
|
||||
to be replayed by child shards when they start for the first time. To minimize
|
||||
this work, we may flush the parent shard to remote storage before writing the
|
||||
remote indices for child shards.
|
||||
|
||||
It is important that this flush is subject to some time bounds: we may be splitting
|
||||
in response to a surge of write ingest, so it may be time-critical to split. A
|
||||
few seconds to flush latest data should be sufficient to optimize common cases without
|
||||
running the risk of holding up a split for a harmful length of time when a parent
|
||||
shard is being written heavily. If the flush doesn't complete in time, we may proceed
|
||||
to shut down the parent shard and carry on with the split.
|
||||
|
||||
### Hard linking parent layers into child shard directories
|
||||
|
||||
Before we start the Tenant objects for child shards, we may pre-populate their
|
||||
local storage directories with hard links to the layer files already present
|
||||
in the parent shard's local directory. When the child shard starts and downloads
|
||||
its remote index, it will find all those layer files already present on local disk.
|
||||
|
||||
This avoids wasting download capacity and makes splitting faster, but more importantly
|
||||
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
|
||||
|
||||
This mechanism will work well in typical flows where shards are migrated away
|
||||
promptly after a split, but for the general case including what happens when
|
||||
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
|
||||
section below.
|
||||
|
||||
### Filtering during compaction
|
||||
|
||||
Compaction, especially image layer generation, should skip any keys that are
|
||||
present in a shard's layer files, but do not match the shard's ShardIdentity's
|
||||
is_key_local() check. This avoids carrying around data for longer than necessary
|
||||
in post-split compactions.
|
||||
|
||||
This was already implemented in https://github.com/neondatabase/neon/pull/6246
|
||||
|
||||
### Proactive compaction
|
||||
|
||||
In remote storage, there is little reason to rewrite any data on a shard split:
|
||||
all the children can reference parent layers via the very cheap write of the child
|
||||
index_part.json.
|
||||
|
||||
In local storage, things are more nuanced. During the initial split there is no
|
||||
capacity cost to duplicating parent layers, if we implement the hard linking
|
||||
optimization described above. However, as soon as any layers are evicted from
|
||||
local disk and re-downloaded, the downloaded layers will not be hard-links any more:
|
||||
they'll have real capacity footprint. That isn't a problem if we migrate child shards
|
||||
away from the parent node swiftly, but it risks a significant over-use of local disk
|
||||
space if we do not.
|
||||
|
||||
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
|
||||
the shards elsewhere, then churned all the layers in all the shards via eviction,
|
||||
then we would blow up the storage capacity used on the node by 8x. If we're splitting
|
||||
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
|
||||
|
||||
To avoid this scenario, we could implement a special compaction mode where we just
|
||||
read historic layers, drop unwanted keys, and write back the layer file. This
|
||||
is pretty expensive, but useful if we have split a large shard and are not going to
|
||||
migrate the child shards away.
|
||||
|
||||
The heuristic conditions for triggering such a compaction are:
|
||||
|
||||
- A) eviction plus time: if a child shard
|
||||
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
|
||||
- B) resident size plus time: we may inspect the resident layers and calculate how
|
||||
many of them include the overhead of storing pre-split keys. After some time
|
||||
threshold (different to the one in case A) we still have such layers occupying
|
||||
local disk space, then we should proactively compact them.
|
||||
|
||||
### Cleaning up parent-shard layers
|
||||
|
||||
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
|
||||
They would be cleaned up in the event of the tenant's deletion.
|
||||
|
||||
As an optimization to avoid leaking remote storage capacity (which costs money), we may
|
||||
lazily clean up parent shard layers once no child shards reference them.
|
||||
|
||||
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
|
||||
|
||||
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
|
||||
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard*count < max(shard_count) over all shards)`, and those shard prefixes which do have the latest shard count (_current shards_)
|
||||
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
|
||||
may drop out now.
|
||||
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
|
||||
- for all ancestral shards, list objects in the prefix and delete any layer which was not
|
||||
referenced by a current shard.
|
||||
|
||||
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
|
||||
|
||||
The cleanup may be done by the scrubber (external process), or we may choose to have
|
||||
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
|
||||
reading the other shard's indices at runtime, and we do not require visibility of the
|
||||
latest index writes.
|
||||
|
||||
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
|
||||
that we retain the option to roll back a split in case of bugs.
|
||||
|
||||
### Splitting secondary locations
|
||||
|
||||
We may implement a pageserver API similar to the main splitting API, which does a simpler
|
||||
operation for secondary locations: it would not write anything to S3, instead it would simply
|
||||
create the child shard directory on local disk, hard link in directories from the parent,
|
||||
and set up the in memory (TenantSlot) state for the children.
|
||||
|
||||
Similar to attached locations, a subset of secondary locations will probably need re-locating
|
||||
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
|
||||
where they may use excessive space for the tenant.
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### What should the thresholds be set to?
|
||||
|
||||
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
|
||||
|
||||
Max shard count:
|
||||
|
||||
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
|
||||
the un-filtered WAL is sent to all shards. To avoid this growing out of control,
|
||||
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
|
||||
on the safekeeper.
|
||||
- there is also little benefit to increasing the shard count beyond the number
|
||||
of pageservers in a region.
|
||||
|
||||
### Is it worth just rewriting all the data during a split to simplify reasoning about space?
|
||||
@@ -29,6 +29,7 @@ pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod more_process_metrics;
|
||||
|
||||
23
libs/metrics/src/metric_vec_duration.rs
Normal file
23
libs/metrics/src/metric_vec_duration.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
pub trait DurationResultObserver {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
||||
}
|
||||
|
||||
pub async fn observe_async_block_duration_by_result<
|
||||
T,
|
||||
E,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
O: DurationResultObserver,
|
||||
>(
|
||||
observer: &O,
|
||||
block: F,
|
||||
) -> Result<T, E> {
|
||||
let start = Instant::now();
|
||||
let result = block.await;
|
||||
let duration = start.elapsed();
|
||||
observer.observe_result(&result, duration);
|
||||
result
|
||||
}
|
||||
@@ -6,10 +6,7 @@ use std::str::FromStr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use crate::{models::ShardParameters, shard::TenantShardId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
@@ -38,7 +35,7 @@ pub struct NodeRegisterRequest {
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailabilityWrapper>,
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
@@ -60,31 +57,6 @@ pub struct TenantLocateResponse {
|
||||
pub shard_params: ShardParameters,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponse {
|
||||
pub shards: Vec<TenantDescribeResponseShard>,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
pub policy: PlacementPolicy,
|
||||
pub config: TenantConfig,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantDescribeResponseShard {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
|
||||
pub node_attached: Option<NodeId>,
|
||||
pub node_secondary: Vec<NodeId>,
|
||||
|
||||
pub last_error: String,
|
||||
|
||||
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
|
||||
pub is_reconciling: bool,
|
||||
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
|
||||
pub is_pending_compute_notification: bool,
|
||||
/// A shard split is currently underway
|
||||
pub is_splitting: bool,
|
||||
}
|
||||
|
||||
/// Explicitly migrating a particular shard is a low level operation
|
||||
/// TODO: higher level "Reschedule tenant" operation where the request
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
@@ -94,82 +66,30 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
pub fn worst() -> Self {
|
||||
UtilizationScore(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active(UtilizationScore),
|
||||
Active,
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl PartialEq for NodeAvailability {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
use NodeAvailability::*;
|
||||
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for NodeAvailability {}
|
||||
|
||||
// This wrapper provides serde functionality and it should only be used to
|
||||
// communicate with external callers which don't know or care about the
|
||||
// utilisation score of the pageserver it is targeting.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum NodeAvailabilityWrapper {
|
||||
Active,
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||
fn from(val: NodeAvailabilityWrapper) -> Self {
|
||||
match val {
|
||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||
// the heartbeats.
|
||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
fn from(val: NodeAvailability) -> Self {
|
||||
match val {
|
||||
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
|
||||
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
// This is used when parsing node configuration requests from neon-local.
|
||||
// Assume the worst possible utilisation score
|
||||
// and let it get updated via the heartbeats.
|
||||
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
||||
"active" => Ok(Self::Active),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
@@ -205,42 +125,5 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
}
|
||||
}
|
||||
|
||||
/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
|
||||
/// to create secondary locations.
|
||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
pub enum PlacementPolicy {
|
||||
/// Normal live state: one attached pageserver and zero or more secondaries.
|
||||
Attached(usize),
|
||||
/// Create one secondary mode locations. This is useful when onboarding
|
||||
/// a tenant, or for an idle tenant that we might want to bring online quickly.
|
||||
Secondary,
|
||||
|
||||
/// Do not attach to any pageservers. This is appropriate for tenants that
|
||||
/// have been idle for a long time, where we do not mind some delay in making
|
||||
/// them available in future.
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use serde_json;
|
||||
|
||||
/// Check stability of PlacementPolicy's serialization
|
||||
#[test]
|
||||
fn placement_policy_encoding() -> anyhow::Result<()> {
|
||||
let v = PlacementPolicy::Attached(1);
|
||||
let encoded = serde_json::to_string(&v)?;
|
||||
assert_eq!(encoded, "{\"Attached\":1}");
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
|
||||
let v = PlacementPolicy::Detached;
|
||||
let encoded = serde_json::to_string(&v)?;
|
||||
assert_eq!(encoded, "\"Detached\"");
|
||||
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ pub mod utilization;
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
@@ -22,7 +21,6 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::controller_api::PlacementPolicy;
|
||||
use crate::{
|
||||
reltag::RelTag,
|
||||
shard::{ShardCount, ShardStripeSize, TenantShardId},
|
||||
@@ -199,13 +197,6 @@ pub struct TimelineCreateRequest {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantShardSplitRequest {
|
||||
pub new_shard_count: u8,
|
||||
|
||||
// A tenant's stripe size is only meaningful the first time their shard count goes
|
||||
// above 1: therefore during a split from 1->N shards, we may modify the stripe size.
|
||||
//
|
||||
// If this is set while the stripe count is being increased from an already >1 value,
|
||||
// then the request will fail with 400.
|
||||
pub new_stripe_size: Option<ShardStripeSize>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -251,11 +242,6 @@ pub struct TenantCreateRequest {
|
||||
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
|
||||
pub shard_parameters: ShardParameters,
|
||||
|
||||
// This parameter is only meaningful in requests sent to the storage controller
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub placement_policy: Option<PlacementPolicy>,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
@@ -427,7 +413,7 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
pub tenant_id: Option<TenantShardId>,
|
||||
pub tenant_id: TenantShardId,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
@@ -449,8 +435,6 @@ pub struct TenantShardLocation {
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigResponse {
|
||||
pub shards: Vec<TenantShardLocation>,
|
||||
// If the shards' ShardCount count is >1, stripe_size will be set.
|
||||
pub stripe_size: Option<ShardStripeSize>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -578,7 +562,7 @@ pub struct TimelineInfo {
|
||||
pub walreceiver_status: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct LayerMapInfo {
|
||||
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
||||
pub historic_layers: Vec<HistoricLayerInfo>,
|
||||
@@ -596,7 +580,7 @@ pub enum LayerAccessKind {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStatFullDetails {
|
||||
pub when_millis_since_epoch: u64,
|
||||
pub task_kind: Cow<'static, str>,
|
||||
pub task_kind: &'static str,
|
||||
pub access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
@@ -655,23 +639,23 @@ impl LayerResidenceEvent {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct LayerAccessStats {
|
||||
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
||||
pub task_kind_access_flag: Vec<Cow<'static, str>>,
|
||||
pub task_kind_access_flag: Vec<&'static str>,
|
||||
pub first: Option<LayerAccessStatFullDetails>,
|
||||
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum InMemoryLayerInfo {
|
||||
Open { lsn_start: Lsn },
|
||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum HistoricLayerInfo {
|
||||
Delta {
|
||||
@@ -693,32 +677,6 @@ pub enum HistoricLayerInfo {
|
||||
},
|
||||
}
|
||||
|
||||
impl HistoricLayerInfo {
|
||||
pub fn layer_file_name(&self) -> &str {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name, ..
|
||||
} => layer_file_name,
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name, ..
|
||||
} => layer_file_name,
|
||||
}
|
||||
}
|
||||
pub fn is_remote(&self) -> bool {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta { remote, .. } => *remote,
|
||||
HistoricLayerInfo::Image { remote, .. } => *remote,
|
||||
}
|
||||
}
|
||||
pub fn set_remote(&mut self, value: bool) {
|
||||
let field = match self {
|
||||
HistoricLayerInfo::Delta { remote, .. } => remote,
|
||||
HistoricLayerInfo::Image { remote, .. } => remote,
|
||||
};
|
||||
*field = value;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
@@ -751,52 +709,6 @@ pub struct WalRedoManagerStatus {
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||
/// what's happening.
|
||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct SecondaryProgress {
|
||||
/// The remote storage LastModified time of the heatmap object we last downloaded.
|
||||
#[serde(
|
||||
serialize_with = "opt_ser_rfc3339_millis",
|
||||
deserialize_with = "opt_deser_rfc3339_millis"
|
||||
)]
|
||||
pub heatmap_mtime: Option<SystemTime>,
|
||||
|
||||
/// The number of layers currently on-disk
|
||||
pub layers_downloaded: usize,
|
||||
/// The number of layers in the most recently seen heatmap
|
||||
pub layers_total: usize,
|
||||
|
||||
/// The number of layer bytes currently on-disk
|
||||
pub bytes_downloaded: u64,
|
||||
/// The number of layer bytes in the most recently seen heatmap
|
||||
pub bytes_total: u64,
|
||||
}
|
||||
|
||||
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &Option<SystemTime>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
match ts {
|
||||
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
|
||||
None => serializer.serialize_none(),
|
||||
}
|
||||
}
|
||||
|
||||
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
match s {
|
||||
None => Ok(None),
|
||||
Some(s) => humantime::parse_rfc3339(&s)
|
||||
.map_err(serde::de::Error::custom)
|
||||
.map(Some),
|
||||
}
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
#[derive(
|
||||
Copy,
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::time::SystemTime;
|
||||
///
|
||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||
/// not handle full u64 values properly.
|
||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
pub struct PageserverUtilization {
|
||||
/// Used disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
@@ -21,10 +21,7 @@ pub struct PageserverUtilization {
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(
|
||||
serialize_with = "ser_rfc3339_millis",
|
||||
deserialize_with = "deser_rfc3339_millis"
|
||||
)]
|
||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
@@ -35,14 +32,6 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||
///
|
||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||
|
||||
@@ -6,18 +6,11 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||
use crate::shard::TenantShardId;
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
/// Optional inline self-registration: this is useful with the storage controller,
|
||||
/// if the node already has a node_id set.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub register: Option<NodeRegisterRequest>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use futures::pin_mut;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io::ErrorKind;
|
||||
use std::net::SocketAddr;
|
||||
@@ -377,7 +378,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
&mut self,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let flush_fut = std::pin::pin!(self.flush());
|
||||
let flush_fut = self.flush();
|
||||
pin_mut!(flush_fut);
|
||||
flush_fut.poll(cx)
|
||||
}
|
||||
|
||||
|
||||
@@ -72,19 +72,14 @@ async fn simple_select() {
|
||||
}
|
||||
}
|
||||
|
||||
static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
|
||||
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("key.pem"));
|
||||
let key = rustls_pemfile::rsa_private_keys(&mut cursor)
|
||||
.next()
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
rustls::pki_types::PrivateKeyDer::Pkcs1(key)
|
||||
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
|
||||
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
|
||||
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
|
||||
let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
|
||||
cert
|
||||
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
|
||||
});
|
||||
|
||||
// test that basic select with ssl works
|
||||
@@ -93,8 +88,9 @@ async fn simple_select_ssl() {
|
||||
let (client_sock, server_sock) = make_tcp_pair().await;
|
||||
|
||||
let server_cfg = rustls::ServerConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(vec![CERT.clone()], KEY.clone_key())
|
||||
.with_single_cert(vec![CERT.clone()], KEY.clone())
|
||||
.unwrap();
|
||||
let tls_config = Some(Arc::new(server_cfg));
|
||||
let pgbackend =
|
||||
@@ -106,9 +102,10 @@ async fn simple_select_ssl() {
|
||||
});
|
||||
|
||||
let client_cfg = rustls::ClientConfig::builder()
|
||||
.with_safe_defaults()
|
||||
.with_root_certificates({
|
||||
let mut store = rustls::RootCertStore::empty();
|
||||
store.add(CERT.clone()).unwrap();
|
||||
store.add(&CERT).unwrap();
|
||||
store
|
||||
})
|
||||
.with_no_client_auth();
|
||||
|
||||
@@ -18,7 +18,6 @@ camino.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
|
||||
@@ -157,8 +157,9 @@ impl AzureBlobStorage {
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(part.blob.properties.etag);
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
@@ -173,16 +174,6 @@ impl AzureBlobStorage {
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
}
|
||||
|
||||
if bufs.is_empty() {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Azure GET response contained no buffers"
|
||||
)));
|
||||
}
|
||||
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
||||
let etag = etag.unwrap();
|
||||
let last_modified = last_modified.unwrap();
|
||||
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
|
||||
@@ -42,9 +42,6 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
||||
pub use azure_core::Etag;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
@@ -294,9 +291,9 @@ pub type DownloadStream =
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: SystemTime,
|
||||
pub last_modified: Option<SystemTime>,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Etag,
|
||||
pub etag: Option<String>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::{
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
@@ -30,7 +30,6 @@ use crate::{
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
use crate::Etag;
|
||||
|
||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
@@ -407,37 +406,35 @@ impl RemoteStorage for LocalFs {
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let file_metadata = file_metadata(&target_path).await?;
|
||||
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let etag = mock_etag(&file_metadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: file_metadata
|
||||
.modified()
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||
etag,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -455,51 +452,50 @@ impl RemoteStorage for LocalFs {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||
}
|
||||
}
|
||||
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
let file_metadata = file_metadata(&target_path).await?;
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let len = source
|
||||
.metadata()
|
||||
.await
|
||||
.context("query file length")
|
||||
.map_err(DownloadError::Other)?
|
||||
.len();
|
||||
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let len = source
|
||||
.metadata()
|
||||
.await
|
||||
.context("query file length")
|
||||
.map_err(DownloadError::Other)?
|
||||
.len();
|
||||
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let etag = mock_etag(&file_metadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: file_metadata
|
||||
.modified()
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||
etag,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
@@ -614,22 +610,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
|
||||
tokio::fs::metadata(&file_path).await.map_err(|e| {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
DownloadError::NotFound
|
||||
} else {
|
||||
DownloadError::BadInput(e.into())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
|
||||
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
|
||||
// quickly, with less overhead than using a mock S3 server.
|
||||
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
|
||||
let mtime = meta.modified().expect("Filesystem mtime missing");
|
||||
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
|
||||
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
if file_path.exists() {
|
||||
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use hyper::Body;
|
||||
@@ -287,17 +287,8 @@ impl S3Bucket {
|
||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output
|
||||
.e_tag
|
||||
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
|
||||
.into();
|
||||
let last_modified = object_output
|
||||
.last_modified
|
||||
.ok_or(DownloadError::Other(anyhow::anyhow!(
|
||||
"Missing LastModified header"
|
||||
)))?
|
||||
.try_into()
|
||||
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
|
||||
let etag = object_output.e_tag;
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
|
||||
@@ -17,7 +17,6 @@ use remote_storage::{
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
@@ -118,7 +117,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||
let last_modified = dl.last_modified;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
@@ -485,33 +484,32 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let stream = ctx
|
||||
let mut stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
|
||||
let first = reader.fill_buf().await.expect("should have the first blob");
|
||||
|
||||
let len = first.len();
|
||||
tracing::info!(len, "downloaded first chunk");
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < file_len,
|
||||
first.len() < len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
reader.consume(len);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let next = reader.fill_buf().await;
|
||||
let next = stream.next().await.expect("stream should have more");
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
@@ -522,10 +520,6 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
|
||||
let e = DownloadError::from(e);
|
||||
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
@@ -37,7 +36,6 @@ serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
@@ -48,7 +46,6 @@ strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
walkdir.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
@@ -47,10 +47,9 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
#[derive(serde::Serialize)]
|
||||
struct SerdeRepr<T> {
|
||||
buffer: Vec<T>,
|
||||
buffer_size: usize,
|
||||
drop_count: u64,
|
||||
}
|
||||
|
||||
@@ -62,7 +61,6 @@ where
|
||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||
SerdeRepr {
|
||||
buffer: buffer.iter().cloned().collect(),
|
||||
buffer_size: L,
|
||||
drop_count: *drop_count,
|
||||
}
|
||||
}
|
||||
@@ -80,52 +78,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
||||
where
|
||||
T: Clone + serde::Deserialize<'de>,
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let SerdeRepr {
|
||||
buffer: des_buffer,
|
||||
drop_count,
|
||||
buffer_size,
|
||||
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
||||
if buffer_size != L {
|
||||
use serde::de::Error;
|
||||
return Err(D::Error::custom(format!(
|
||||
"invalid buffer_size, expecting {L} got {buffer_size}"
|
||||
)));
|
||||
}
|
||||
let mut buffer = HistoryBuffer::new();
|
||||
buffer.extend(des_buffer);
|
||||
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::HistoryBufferWithDropCounter;
|
||||
|
||||
#[test]
|
||||
fn test_basics() {
|
||||
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
b.write(1);
|
||||
b.write(2);
|
||||
b.write(3);
|
||||
assert!(b.iter().any(|e| *e == 2));
|
||||
assert!(b.iter().any(|e| *e == 3));
|
||||
assert!(!b.iter().any(|e| *e == 1));
|
||||
|
||||
// round-trip serde
|
||||
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
||||
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
||||
b.iter().cloned().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -87,8 +87,6 @@ pub mod failpoint_support;
|
||||
|
||||
pub mod yielding_loop;
|
||||
|
||||
pub mod zstd;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -29,10 +29,12 @@ pub struct PageserverFeedback {
|
||||
// Serialize with RFC3339 format.
|
||||
#[serde(with = "serde_systemtime")]
|
||||
pub replytime: SystemTime,
|
||||
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
|
||||
pub shard_number: u32,
|
||||
}
|
||||
|
||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||
// Do not remove previously available fields because this might be backwards incompatible.
|
||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
|
||||
impl PageserverFeedback {
|
||||
pub fn empty() -> PageserverFeedback {
|
||||
PageserverFeedback {
|
||||
@@ -41,7 +43,6 @@ impl PageserverFeedback {
|
||||
remote_consistent_lsn: Lsn::INVALID,
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
shard_number: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,26 +59,17 @@ impl PageserverFeedback {
|
||||
//
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
let buf_ptr = buf.len();
|
||||
buf.put_u8(0); // # of keys, will be filled later
|
||||
let mut nkeys = 0;
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.last_received_lsn.0);
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.disk_consistent_lsn.0);
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.remote_consistent_lsn.0);
|
||||
@@ -88,19 +80,9 @@ impl PageserverFeedback {
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"ps_replytime\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_i64(timestamp);
|
||||
|
||||
if self.shard_number > 0 {
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"shard_number\0");
|
||||
buf.put_i32(4);
|
||||
buf.put_u32(self.shard_number);
|
||||
}
|
||||
|
||||
buf[buf_ptr] = nkeys;
|
||||
}
|
||||
|
||||
// Deserialize PageserverFeedback message
|
||||
@@ -141,11 +123,6 @@ impl PageserverFeedback {
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
b"shard_number" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 4);
|
||||
rf.shard_number = buf.get_u32();
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
@@ -217,7 +194,10 @@ mod tests {
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
data[0] += 1;
|
||||
if let Some(first) = data.first_mut() {
|
||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
data.put_slice(b"new_field_one\0");
|
||||
data.put_i32(8);
|
||||
data.put_u64(42);
|
||||
|
||||
@@ -110,49 +110,6 @@ impl<T> OnceCell<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or returns an unique initialization
|
||||
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
|
||||
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
|
||||
// It looks like OnceCell::get_or_init could be implemented using this method instead of
|
||||
// duplication. However, that makes the future be !Send due to possibly holding on to the
|
||||
// MutexGuard over an await point.
|
||||
loop {
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
{
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire().await
|
||||
};
|
||||
|
||||
let Ok(permit) = permit else {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
||||
// there was a take_and_deinit in between
|
||||
continue;
|
||||
}
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
};
|
||||
|
||||
permit.forget();
|
||||
}
|
||||
|
||||
let permit = InitPermit(sem);
|
||||
return Err(permit);
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
@@ -245,7 +202,7 @@ impl<'a, T> Guard<'a, T> {
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let sem = swapped.init_semaphore.clone();
|
||||
// acquire and forget right away, moving the control over to InitPermit
|
||||
@@ -524,39 +481,4 @@ mod tests {
|
||||
|
||||
assert_eq!("t1", *cell.get().unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn detached_init_smoke() {
|
||||
let target = OnceCell::default();
|
||||
|
||||
let Err(permit) = target.get_or_init_detached().await else {
|
||||
unreachable!("it is not initialized")
|
||||
};
|
||||
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
|
||||
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
|
||||
)
|
||||
.await
|
||||
.expect_err("should timeout since we are already holding the permit");
|
||||
|
||||
target.set(42, permit);
|
||||
|
||||
let (_answer, permit) = {
|
||||
let guard = target
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(*guard, 42);
|
||||
|
||||
guard.take_and_deinit()
|
||||
};
|
||||
|
||||
assert!(target.get().is_none());
|
||||
|
||||
target.set(11, permit);
|
||||
|
||||
assert_eq!(*target.get().unwrap(), 11);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,60 +1,27 @@
|
||||
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum VecMapOrdering {
|
||||
Greater,
|
||||
GreaterOrEqual,
|
||||
}
|
||||
|
||||
/// Ordered map datastructure implemented in a Vec.
|
||||
/// Append only - can only add keys that are larger than the
|
||||
/// current max key.
|
||||
/// Ordering can be adjusted using [`VecMapOrdering`]
|
||||
/// during `VecMap` construction.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct VecMap<K, V> {
|
||||
data: Vec<(K, V)>,
|
||||
ordering: VecMapOrdering,
|
||||
}
|
||||
pub struct VecMap<K, V>(Vec<(K, V)>);
|
||||
|
||||
impl<K, V> Default for VecMap<K, V> {
|
||||
fn default() -> Self {
|
||||
VecMap {
|
||||
data: Default::default(),
|
||||
ordering: VecMapOrdering::Greater,
|
||||
}
|
||||
VecMap(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum VecMapError {
|
||||
#[error("Key violates ordering constraint")]
|
||||
InvalidKey,
|
||||
#[error("Mismatched ordering constraints")]
|
||||
ExtendOrderingError,
|
||||
}
|
||||
#[derive(Debug)]
|
||||
pub struct InvalidKey;
|
||||
|
||||
impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn new(ordering: VecMapOrdering) -> Self {
|
||||
Self {
|
||||
data: Vec::new(),
|
||||
ordering,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
|
||||
Self {
|
||||
data: Vec::with_capacity(capacity),
|
||||
ordering,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.is_empty()
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[(K, V)] {
|
||||
self.data.as_slice()
|
||||
self.0.as_slice()
|
||||
}
|
||||
|
||||
/// This function may panic if given a range where the lower bound is
|
||||
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
|
||||
use std::ops::Bound::*;
|
||||
|
||||
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
|
||||
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
|
||||
|
||||
let start_idx = match range.start_bound() {
|
||||
Unbounded => 0,
|
||||
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
};
|
||||
|
||||
let end_idx = match range.end_bound() {
|
||||
Unbounded => self.data.len(),
|
||||
Unbounded => self.0.len(),
|
||||
Included(k) => match binary_search(k) {
|
||||
Ok(idx) => idx + 1,
|
||||
Err(idx) => idx,
|
||||
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
|
||||
};
|
||||
|
||||
&self.data[start_idx..end_idx]
|
||||
&self.0[start_idx..end_idx]
|
||||
}
|
||||
|
||||
/// Add a key value pair to the map.
|
||||
/// If `key` is not respective of the `self` ordering the
|
||||
/// pair will not be added and `InvalidKey` error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
|
||||
self.validate_key_order(&key)?;
|
||||
/// If `key` is less than or equal to the current maximum key
|
||||
/// the pair will not be added and InvalidKey error will be returned.
|
||||
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
|
||||
if let Some((last_key, _last_value)) = self.0.last() {
|
||||
if &key <= last_key {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Update the maximum key value pair or add a new key value pair to the map.
|
||||
/// If `key` is not respective of the `self` ordering no updates or additions
|
||||
/// will occur and `InvalidKey` error will be returned.
|
||||
/// If `key` is less than the current maximum key no updates or additions
|
||||
/// will occur and InvalidKey error will be returned.
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
key: K,
|
||||
mut value: V,
|
||||
) -> Result<(Option<V>, usize), VecMapError> {
|
||||
if let Some((last_key, last_value)) = self.data.last_mut() {
|
||||
) -> Result<(Option<V>, usize), InvalidKey> {
|
||||
if let Some((last_key, last_value)) = self.0.last_mut() {
|
||||
match key.cmp(last_key) {
|
||||
Ordering::Less => return Err(VecMapError::InvalidKey),
|
||||
Ordering::Less => return Err(InvalidKey),
|
||||
Ordering::Equal => {
|
||||
std::mem::swap(last_value, &mut value);
|
||||
const DELTA_SIZE: usize = 0;
|
||||
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
V: Clone,
|
||||
{
|
||||
let split_idx = self
|
||||
.data
|
||||
.0
|
||||
.binary_search_by_key(&cutoff, extract_key)
|
||||
.unwrap_or_else(std::convert::identity);
|
||||
|
||||
(
|
||||
VecMap {
|
||||
data: self.data[..split_idx].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap {
|
||||
data: self.data[split_idx..].to_vec(),
|
||||
ordering: self.ordering,
|
||||
},
|
||||
VecMap(self.0[..split_idx].to_vec()),
|
||||
VecMap(self.0[split_idx..].to_vec()),
|
||||
)
|
||||
}
|
||||
|
||||
/// Move items from `other` to the end of `self`, leaving `other` empty.
|
||||
/// If the `other` ordering is different from `self` ordering
|
||||
/// `ExtendOrderingError` error will be returned.
|
||||
/// If any keys in `other` is not respective of the ordering defined in
|
||||
/// `self`, `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
|
||||
if self.ordering != other.ordering {
|
||||
return Err(VecMapError::ExtendOrderingError);
|
||||
}
|
||||
/// If any keys in `other` is less than or equal to any key in `self`,
|
||||
/// `InvalidKey` error will be returned and no mutation will occur.
|
||||
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
|
||||
let self_last_opt = self.0.last().map(extract_key);
|
||||
let other_first_opt = other.0.last().map(extract_key);
|
||||
|
||||
let other_first_opt = other.data.last().map(extract_key);
|
||||
if let Some(other_first) = other_first_opt {
|
||||
self.validate_key_order(other_first)?;
|
||||
}
|
||||
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Validate the current last key in `self` and key being
|
||||
/// inserted against the order defined in `self`.
|
||||
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
|
||||
if let Some(last_key) = self.data.last().map(extract_key) {
|
||||
match (&self.ordering, &key.cmp(last_key)) {
|
||||
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
|
||||
return Err(VecMapError::InvalidKey);
|
||||
}
|
||||
(VecMapOrdering::Greater, Ordering::Greater) => {}
|
||||
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
|
||||
return Err(VecMapError::InvalidKey);
|
||||
}
|
||||
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
|
||||
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
|
||||
if self_last >= other_first {
|
||||
return Err(InvalidKey);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
|
||||
Ok(delta_size)
|
||||
}
|
||||
|
||||
/// Instrument an operation on the underlying [`Vec`].
|
||||
/// Will panic if the operation decreases capacity.
|
||||
/// Returns the increase in memory usage caused by the op.
|
||||
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
|
||||
let old_cap = self.data.capacity();
|
||||
op(&mut self.data);
|
||||
let new_cap = self.data.capacity();
|
||||
let old_cap = self.0.capacity();
|
||||
op(&mut self.0);
|
||||
let new_cap = self.0.capacity();
|
||||
|
||||
match old_cap.cmp(&new_cap) {
|
||||
Ordering::Less => {
|
||||
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
|
||||
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `from_iter` defined in `FromIter` trait except
|
||||
/// that it accepts an [`VecMapOrdering`]
|
||||
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
|
||||
let iter = iter.into_iter();
|
||||
let initial_capacity = {
|
||||
match iter.size_hint() {
|
||||
(lower_bound, None) => lower_bound,
|
||||
(_, Some(upper_bound)) => upper_bound,
|
||||
}
|
||||
};
|
||||
|
||||
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
|
||||
for (key, value) in iter {
|
||||
vec_map
|
||||
.append(key, value)
|
||||
.expect("The passed collection needs to be sorted!");
|
||||
}
|
||||
|
||||
vec_map
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
|
||||
type Item = (K, V);
|
||||
type IntoIter = std::vec::IntoIter<(K, V)>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.data.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
|
||||
mod tests {
|
||||
use std::{collections::BTreeMap, ops::Bound};
|
||||
|
||||
use super::{VecMap, VecMapOrdering};
|
||||
use super::VecMap;
|
||||
|
||||
#[test]
|
||||
fn unbounded_range() {
|
||||
@@ -396,59 +310,5 @@ mod tests {
|
||||
left.extend(&mut one_map).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(one_map.as_slice(), &[(1, ())]);
|
||||
|
||||
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
map_greater_or_equal.append(2, ()).unwrap();
|
||||
map_greater_or_equal.append(2, ()).unwrap();
|
||||
|
||||
left.extend(&mut map_greater_or_equal).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
|
||||
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn extend_with_ordering() {
|
||||
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
left.append(0, ()).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
|
||||
greater_right.append(0, ()).unwrap();
|
||||
left.extend(&mut greater_right).unwrap_err();
|
||||
assert_eq!(left.as_slice(), &[(0, ())]);
|
||||
|
||||
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
|
||||
greater_or_equal_right.append(2, ()).unwrap();
|
||||
greater_or_equal_right.append(2, ()).unwrap();
|
||||
left.extend(&mut greater_or_equal_right).unwrap();
|
||||
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn vec_map_from_sorted() {
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
|
||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
|
||||
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
|
||||
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||
assert_eq!(
|
||||
vec_map.as_slice(),
|
||||
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn vec_map_from_unsorted_greater() {
|
||||
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
|
||||
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn vec_map_from_unsorted_greater_or_equal() {
|
||||
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
|
||||
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use async_compression::{
|
||||
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
|
||||
zstd::CParameter,
|
||||
Level,
|
||||
};
|
||||
use camino::Utf8Path;
|
||||
use nix::NixPath;
|
||||
use tokio::{
|
||||
fs::{File, OpenOptions},
|
||||
io::AsyncBufRead,
|
||||
io::AsyncSeekExt,
|
||||
io::AsyncWriteExt,
|
||||
};
|
||||
use tokio_tar::{Archive, Builder, HeaderMode};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tarball)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tarball}"))?;
|
||||
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(path) {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
// Also allow directories so that we also get empty directories
|
||||
if !(metadata.is_file() || metadata.is_dir()) {
|
||||
continue;
|
||||
}
|
||||
let path = entry.into_path();
|
||||
paths.push(path);
|
||||
}
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
let mut builder = Builder::new(zstd);
|
||||
// Use reproducible header mode
|
||||
builder.mode(HeaderMode::Deterministic);
|
||||
for p in paths {
|
||||
let rel_path = p.strip_prefix(path)?;
|
||||
if rel_path.is_empty() {
|
||||
// The top directory should not be compressed,
|
||||
// the tar crate doesn't like that
|
||||
continue;
|
||||
}
|
||||
builder.append_path_with_name(&p, rel_path).await?;
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
/// Creates a Zstandard tarball.
|
||||
pub async fn extract_zst_tarball(
|
||||
path: &Utf8Path,
|
||||
tarball: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let decoder = Box::pin(ZstdDecoder::new(tarball));
|
||||
let mut archive = Archive::new(decoder);
|
||||
archive.unpack(path).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
|
||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
|
||||
@@ -89,9 +89,6 @@ enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
|
||||
@@ -7,7 +7,7 @@ use utils::{
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub struct Client {
|
||||
mgmt_api_endpoint: String,
|
||||
authorization_header: Option<String>,
|
||||
@@ -24,9 +24,6 @@ pub enum Error {
|
||||
|
||||
#[error("pageserver API: {1}")]
|
||||
ApiError(StatusCode, String),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -169,7 +166,7 @@ impl Client {
|
||||
self.request(Method::GET, uri, ()).await
|
||||
}
|
||||
|
||||
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
@@ -181,16 +178,7 @@ impl Client {
|
||||
} else {
|
||||
req
|
||||
};
|
||||
req.json(&body).send().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let res = self.request_noerror(method, uri, body).await?;
|
||||
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
|
||||
let response = res.error_from_body().await?;
|
||||
Ok(response)
|
||||
}
|
||||
@@ -249,26 +237,13 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
wait: Option<std::time::Duration>,
|
||||
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/secondary/download",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
if let Some(wait) = wait {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_ms", &format!("{}", wait.as_millis()));
|
||||
}
|
||||
|
||||
let response = self.request(Method::POST, path, ()).await?;
|
||||
let status = response.status();
|
||||
let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
|
||||
Ok((status, progress))
|
||||
);
|
||||
self.request(Method::POST, &uri, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
@@ -276,30 +251,21 @@ impl Client {
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<std::time::Duration>,
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest {
|
||||
tenant_id: Some(tenant_shard_id),
|
||||
tenant_id: tenant_shard_id,
|
||||
config,
|
||||
};
|
||||
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
let path = format!(
|
||||
"{}/v1/tenant/{}/location_config",
|
||||
self.mgmt_api_endpoint, tenant_shard_id
|
||||
))
|
||||
// Should always work: mgmt_api_endpoint is configuration, not user input.
|
||||
.expect("Cannot build URL");
|
||||
|
||||
if lazy {
|
||||
path.query_pairs_mut().append_pair("lazy", "true");
|
||||
}
|
||||
|
||||
if let Some(flush_ms) = flush_ms {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("flush_ms", &format!("{}", flush_ms.as_millis()));
|
||||
}
|
||||
|
||||
self.request(Method::PUT, path, &req_body).await?;
|
||||
);
|
||||
let path = if let Some(flush_ms) = flush_ms {
|
||||
format!("{}?flush_ms={}", path, flush_ms.as_millis())
|
||||
} else {
|
||||
path
|
||||
};
|
||||
self.request(Method::PUT, &path, &req_body).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -312,21 +278,6 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<Option<LocationConfig>> {
|
||||
let path = format!(
|
||||
"{}/v1/location_config/{tenant_shard_id}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.request(Method::GET, &path, ())
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -438,77 +389,4 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_map_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<LayerMapInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
|
||||
);
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_evict(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &str,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||
);
|
||||
let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
StatusCode::NOT_MODIFIED => Ok(false),
|
||||
// TODO: dedupe this pattern / introduce separate error variant?
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn layer_ondemand_download(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &str,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||
);
|
||||
let resp = self.request_noerror(Method::GET, &uri, ()).await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
StatusCode::NOT_MODIFIED => Ok(false),
|
||||
// TODO: dedupe this pattern / introduce separate error variant?
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,7 +63,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
|
||||
);
|
||||
|
||||
// Identify the range of LSNs that belong to this level. We assume that
|
||||
// each file in this level spans an LSN range up to 1.75x target file
|
||||
// each file in this level span an LSN range up to 1.75x target file
|
||||
// size. That should give us enough slop that if we created a slightly
|
||||
// oversized L0 layer, e.g. because flushing the in-memory layer was
|
||||
// delayed for some reason, we don't consider the oversized layer to
|
||||
@@ -248,6 +248,7 @@ enum CompactionStrategy {
|
||||
CreateImage,
|
||||
}
|
||||
|
||||
#[allow(dead_code)] // Todo
|
||||
struct CompactionJob<E: CompactionJobExecutor> {
|
||||
key_range: Range<E::Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -344,7 +345,7 @@ where
|
||||
///
|
||||
/// TODO: Currently, this is called exactly once for the level, and we
|
||||
/// decide whether to create new image layers to cover the whole level, or
|
||||
/// write a new set of deltas. In the future, this should try to partition
|
||||
/// write a new set of delta. In the future, this should try to partition
|
||||
/// the key space, and make the decision separately for each partition.
|
||||
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
|
||||
let job = &self.jobs[job_id.0];
|
||||
@@ -708,6 +709,18 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
//
|
||||
// Candidates:
|
||||
//
|
||||
// 1. Create an image layer, snapping to previous images
|
||||
// 2. Create a delta layer, snapping to previous images
|
||||
// 3. Create an image layer, snapping to
|
||||
//
|
||||
//
|
||||
|
||||
// Take previous partitioning, based on the image layers below.
|
||||
//
|
||||
// Candidate is at the front:
|
||||
@@ -726,10 +739,6 @@ struct WindowElement<K> {
|
||||
last_key: K, // inclusive
|
||||
accum_size: u64,
|
||||
}
|
||||
|
||||
// Sliding window through keyspace and values
|
||||
//
|
||||
// This is used to decide what layer to write next, from the beginning of the window.
|
||||
struct Window<K> {
|
||||
elems: VecDeque<WindowElement<K>>,
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//! An LSM tree consists of multiple levels, each exponentially larger than the
|
||||
//! previous level. And each level consists of multiple "tiers". With tiered
|
||||
//! An LSM tree consists of multiple levels, each exponential larger than the
|
||||
//! previous level. And each level consists of be multiple "tiers". With tiered
|
||||
//! compaction, a level is compacted when it has accumulated more than N tiers,
|
||||
//! forming one tier on the next level.
|
||||
//!
|
||||
@@ -170,6 +170,13 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
// helper struct used in depth()
|
||||
struct Event<K> {
|
||||
key: K,
|
||||
layer_idx: usize,
|
||||
start: bool,
|
||||
}
|
||||
|
||||
impl<L> Level<L> {
|
||||
/// Count the number of deltas stacked on each other.
|
||||
pub fn depth<K>(&self) -> u64
|
||||
@@ -177,11 +184,6 @@ impl<L> Level<L> {
|
||||
K: CompactionKey,
|
||||
L: CompactionLayer<K>,
|
||||
{
|
||||
struct Event<K> {
|
||||
key: K,
|
||||
layer_idx: usize,
|
||||
start: bool,
|
||||
}
|
||||
let mut events: Vec<Event<K>> = Vec::new();
|
||||
for (idx, l) in self.layers.iter().enumerate() {
|
||||
events.push(Event {
|
||||
@@ -200,7 +202,7 @@ impl<L> Level<L> {
|
||||
// Sweep the key space left to right. Stop at each distinct key, and
|
||||
// count the number of deltas on top of the highest image at that key.
|
||||
//
|
||||
// This is a little inefficient, as we walk through the active_set on
|
||||
// This is a little enefficient, as we walk through the active_set on
|
||||
// every key. We could increment/decrement a counter on each step
|
||||
// instead, but that'd require a bit more complex bookkeeping.
|
||||
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
|
||||
@@ -234,7 +236,6 @@ impl<L> Level<L> {
|
||||
}
|
||||
}
|
||||
}
|
||||
debug_assert_eq!(active_set, BTreeSet::new());
|
||||
max_depth
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,12 +4,12 @@
|
||||
//! All the heavy lifting is done by the create_image and create_delta
|
||||
//! functions that the implementor provides.
|
||||
use async_trait::async_trait;
|
||||
use futures::Future;
|
||||
use pageserver_api::{key::Key, keyspace::key_range_size};
|
||||
use std::ops::Range;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Public interface. This is the main thing that the implementor needs to provide
|
||||
#[async_trait]
|
||||
pub trait CompactionJobExecutor {
|
||||
// Type system.
|
||||
//
|
||||
@@ -17,7 +17,8 @@ pub trait CompactionJobExecutor {
|
||||
// compaction doesn't distinguish whether they are stored locally or
|
||||
// remotely.
|
||||
//
|
||||
// The keyspace is defined by the CompactionKey trait.
|
||||
// The keyspace is defined by CompactionKey trait.
|
||||
//
|
||||
type Key: CompactionKey;
|
||||
|
||||
type Layer: CompactionLayer<Self::Key> + Clone;
|
||||
@@ -34,27 +35,27 @@ pub trait CompactionJobExecutor {
|
||||
// ----
|
||||
|
||||
/// Return all layers that overlap the given bounding box.
|
||||
fn get_layers(
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> impl Future<Output = anyhow::Result<Vec<Self::Layer>>> + Send;
|
||||
) -> anyhow::Result<Vec<Self::Layer>>;
|
||||
|
||||
fn get_keyspace(
|
||||
async fn get_keyspace(
|
||||
&mut self,
|
||||
key_range: &Range<Self::Key>,
|
||||
lsn: Lsn,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> impl Future<Output = anyhow::Result<CompactionKeySpace<Self::Key>>> + Send;
|
||||
) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
|
||||
|
||||
/// NB: This is a pretty expensive operation. In the real pageserver
|
||||
/// implementation, it downloads the layer, and keeps it resident
|
||||
/// until the DeltaLayer is dropped.
|
||||
fn downcast_delta_layer(
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &Self::Layer,
|
||||
) -> impl Future<Output = anyhow::Result<Option<Self::DeltaLayer>>> + Send;
|
||||
) -> anyhow::Result<Option<Self::DeltaLayer>>;
|
||||
|
||||
// ----
|
||||
// Functions to execute the plan
|
||||
@@ -62,33 +63,33 @@ pub trait CompactionJobExecutor {
|
||||
|
||||
/// Create a new image layer, materializing all the values in the key range,
|
||||
/// at given 'lsn'.
|
||||
fn create_image(
|
||||
async fn create_image(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Self::Key>,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> impl Future<Output = anyhow::Result<()>> + Send;
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Create a new delta layer, containing all the values from 'input_layers'
|
||||
/// in the given key and LSN range.
|
||||
fn create_delta(
|
||||
async fn create_delta(
|
||||
&mut self,
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Self::Key>,
|
||||
input_layers: &[Self::DeltaLayer],
|
||||
ctx: &Self::RequestContext,
|
||||
) -> impl Future<Output = anyhow::Result<()>> + Send;
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Delete a layer. The compaction implementation will call this only after
|
||||
/// all the create_image() or create_delta() calls that deletion of this
|
||||
/// layer depends on have finished. But if the implementor has extra lazy
|
||||
/// background tasks, like uploading the index json file to remote storage.
|
||||
/// background tasks, like uploading the index json file to remote storage,
|
||||
/// it is the implementation's responsibility to track those.
|
||||
fn delete_layer(
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &Self::Layer,
|
||||
ctx: &Self::RequestContext,
|
||||
) -> impl Future<Output = anyhow::Result<()>> + Send;
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
|
||||
|
||||
@@ -429,6 +429,7 @@ impl From<&Arc<MockImageLayer>> for MockLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl interface::CompactionJobExecutor for MockTimeline {
|
||||
type Key = Key;
|
||||
type Layer = MockLayer;
|
||||
|
||||
@@ -1,272 +0,0 @@
|
||||
use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
|
||||
|
||||
use pageserver_client::mgmt_api;
|
||||
use rand::seq::SliceRandom;
|
||||
use tracing::{debug, info};
|
||||
use utils::id::{TenantTimelineId, TimelineId};
|
||||
|
||||
use tokio::{
|
||||
sync::{mpsc, OwnedSemaphorePermit},
|
||||
task::JoinSet,
|
||||
};
|
||||
|
||||
use std::{
|
||||
num::NonZeroUsize,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
/// Evict & on-demand download random layers.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long, default_value = "1")]
|
||||
tasks_per_target: NonZeroUsize,
|
||||
#[clap(long, default_value = "1")]
|
||||
concurrency_per_target: NonZeroUsize,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
/// Before starting the benchmark, live-reconfigure the pageserver to use the given
|
||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||
#[clap(long)]
|
||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let task = rt.spawn(main_impl(args));
|
||||
rt.block_on(task).unwrap().unwrap();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
evictions: AtomicU64,
|
||||
downloads: AtomicU64,
|
||||
timeline_restarts: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn eviction_done(&self) {
|
||||
self.evictions.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn download_done(&self) {
|
||||
self.downloads.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn timeline_restart_done(&self) {
|
||||
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
if let Some(engine_str) = &args.set_io_engine {
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
}
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut tasks = JoinSet::new();
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
tasks.spawn({
|
||||
let live_stats = Arc::clone(&live_stats);
|
||||
async move {
|
||||
let mut last_at = Instant::now();
|
||||
loop {
|
||||
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
||||
let now = Instant::now();
|
||||
let delta: Duration = now - last_at;
|
||||
last_at = now;
|
||||
|
||||
let LiveStats {
|
||||
evictions,
|
||||
downloads,
|
||||
timeline_restarts,
|
||||
} = &*live_stats;
|
||||
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
||||
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
for tl in timelines {
|
||||
for _ in 0..args.tasks_per_target.get() {
|
||||
tasks.spawn(timeline_actor(
|
||||
args,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
tl,
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
res.unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn timeline_actor(
|
||||
args: &'static Args,
|
||||
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
||||
timeline: TenantTimelineId,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
// TODO: support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
|
||||
struct Timeline {
|
||||
joinset: JoinSet<()>,
|
||||
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
||||
concurrency: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
loop {
|
||||
debug!("restarting timeline");
|
||||
let layer_map_info = mgmt_api_client
|
||||
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
let concurrency = Arc::new(tokio::sync::Semaphore::new(
|
||||
args.concurrency_per_target.get(),
|
||||
));
|
||||
|
||||
let mut joinset = JoinSet::new();
|
||||
let layers = layer_map_info
|
||||
.historic_layers
|
||||
.into_iter()
|
||||
.map(|historic_layer| {
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
joinset.spawn(layer_actor(
|
||||
tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
historic_layer,
|
||||
rx,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
tx
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut timeline = Timeline {
|
||||
joinset,
|
||||
layers,
|
||||
concurrency,
|
||||
};
|
||||
|
||||
live_stats.timeline_restart_done();
|
||||
|
||||
loop {
|
||||
assert!(!timeline.joinset.is_empty());
|
||||
if let Some(res) = timeline.joinset.try_join_next() {
|
||||
debug!(?res, "a layer actor exited, should not happen");
|
||||
timeline.joinset.shutdown().await;
|
||||
break;
|
||||
}
|
||||
|
||||
let mut permit = Some(
|
||||
Arc::clone(&timeline.concurrency)
|
||||
.acquire_owned()
|
||||
.await
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
loop {
|
||||
let layer_tx = {
|
||||
let mut rng = rand::thread_rng();
|
||||
timeline.layers.choose_mut(&mut rng).expect("no layers")
|
||||
};
|
||||
match layer_tx.try_send(permit.take().unwrap()) {
|
||||
Ok(_) => break,
|
||||
Err(e) => match e {
|
||||
mpsc::error::TrySendError::Full(back) => {
|
||||
// TODO: retrying introduces bias away from slow downloaders
|
||||
permit.replace(back);
|
||||
}
|
||||
mpsc::error::TrySendError::Closed(_) => panic!(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn layer_actor(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
mut layer: HistoricLayerInfo,
|
||||
mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
|
||||
mgmt_api_client: Arc<mgmt_api::Client>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
#[derive(Clone, Copy)]
|
||||
enum Action {
|
||||
Evict,
|
||||
OnDemandDownload,
|
||||
}
|
||||
|
||||
while let Some(_permit) = rx.recv().await {
|
||||
let action = if layer.is_remote() {
|
||||
Action::OnDemandDownload
|
||||
} else {
|
||||
Action::Evict
|
||||
};
|
||||
|
||||
let did_it = match action {
|
||||
Action::Evict => {
|
||||
let did_it = mgmt_api_client
|
||||
.layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.eviction_done();
|
||||
did_it
|
||||
}
|
||||
Action::OnDemandDownload => {
|
||||
let did_it = mgmt_api_client
|
||||
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.download_done();
|
||||
did_it
|
||||
}
|
||||
};
|
||||
if !did_it {
|
||||
debug!("local copy of layer map appears out of sync, re-downloading");
|
||||
return;
|
||||
}
|
||||
debug!("did it");
|
||||
layer.set_remote(match action {
|
||||
Action::Evict => true,
|
||||
Action::OnDemandDownload => false,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,6 @@ mod util {
|
||||
mod cmd {
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
@@ -26,7 +25,6 @@ enum Args {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -45,7 +43,6 @@ fn main() {
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
#![recursion_limit = "300"]
|
||||
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use std::env::{var, VarError};
|
||||
@@ -120,9 +118,6 @@ fn main() -> anyhow::Result<()> {
|
||||
&[("node_id", &conf.id.to_string())],
|
||||
);
|
||||
|
||||
// after setting up logging, log the effective IO engine choice
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||
@@ -317,7 +312,6 @@ fn start_pageserver(
|
||||
let http_listener = tcp_listener::bind(http_addr)?;
|
||||
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
@@ -550,7 +544,7 @@ fn start_pageserver(
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
tenant_manager,
|
||||
http_auth.clone(),
|
||||
remote_storage.clone(),
|
||||
broker_client.clone(),
|
||||
@@ -694,7 +688,6 @@ fn start_pageserver(
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
|
||||
@@ -7,9 +7,8 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
use std::{collections::HashMap, env};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -30,17 +29,18 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||
use crate::{tenant::config::TenantConf, virtual_file};
|
||||
use crate::virtual_file;
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
@@ -83,10 +83,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
@@ -290,49 +286,21 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
// and avoid possible confusion with nested options
|
||||
#[derive(Clone, Default)]
|
||||
pub enum BuilderValue<T> {
|
||||
Set(T),
|
||||
#[default]
|
||||
NotSet,
|
||||
}
|
||||
|
||||
impl<T: Clone> BuilderValue<T> {
|
||||
pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
|
||||
impl<T> BuilderValue<T> {
|
||||
pub fn ok_or<E>(self, err: E) -> Result<T, E> {
|
||||
match self {
|
||||
Self::Set(v) => Ok(v.clone()),
|
||||
Self::NotSet => match default {
|
||||
BuilderValue::Set(v) => Ok(v.clone()),
|
||||
BuilderValue::NotSet => {
|
||||
anyhow::bail!("missing config value {field_name:?}")
|
||||
}
|
||||
},
|
||||
Self::Set(v) => Ok(v),
|
||||
Self::NotSet => Err(err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(serde::Deserialize)]
|
||||
pub(crate) struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
pub(crate) postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub(crate) postgres_port: u16,
|
||||
pub(crate) http_host: String,
|
||||
pub(crate) http_port: u16,
|
||||
|
||||
// Deployment tools may write fields to the metadata file beyond what we
|
||||
// use in this type: this type intentionally only names fields that require.
|
||||
#[serde(flatten)]
|
||||
pub(crate) other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
// needed to simplify config construction
|
||||
#[derive(Default)]
|
||||
struct PageServerConfigBuilder {
|
||||
listen_pg_addr: BuilderValue<String>,
|
||||
|
||||
@@ -400,9 +368,8 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
#[inline(always)]
|
||||
fn default_values() -> Self {
|
||||
impl Default for PageServerConfigBuilder {
|
||||
fn default() -> Self {
|
||||
use self::BuilderValue::*;
|
||||
use defaults::*;
|
||||
Self {
|
||||
@@ -655,96 +622,125 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
macro_rules! conf {
|
||||
(USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
|
||||
PageServerConf {
|
||||
$(
|
||||
$field: self.$field.ok_or(stringify!($field), default.$field)?,
|
||||
)*
|
||||
$(
|
||||
$custom_field: $custom_value,
|
||||
)*
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Ok(conf!(
|
||||
USING DEFAULT
|
||||
{
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
workdir,
|
||||
pg_distrib_dir,
|
||||
http_auth_type,
|
||||
pg_auth_type,
|
||||
auth_validation_public_key_path,
|
||||
remote_storage_config,
|
||||
id,
|
||||
broker_endpoint,
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
control_plane_api_token,
|
||||
control_plane_emergency_mode,
|
||||
heatmap_upload_concurrency,
|
||||
secondary_download_concurrency,
|
||||
ingest_batch_size,
|
||||
get_vectored_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new({
|
||||
self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or("concurrent_tenant_warmpup",
|
||||
default.concurrent_tenant_warmup)?
|
||||
}),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or("concurrent_tenant_size_logical_size_queries",
|
||||
default.concurrent_tenant_size_logical_size_queries.clone())?
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
// re-use `concurrent_tenant_size_logical_size_queries`
|
||||
self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or("eviction_task_immitated_concurrent_logical_size_queries",
|
||||
default.concurrent_tenant_size_logical_size_queries.clone())?,
|
||||
),
|
||||
virtual_file_io_engine: match self.virtual_file_io_engine {
|
||||
BuilderValue::Set(v) => v,
|
||||
BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
|
||||
io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
|
||||
io_engine::FeatureTestResult::Worse { engine, remark } => {
|
||||
// TODO: bubble this up to the caller so we can tracing::warn! it.
|
||||
eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
|
||||
engine
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
))
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?;
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
availability_zone: self
|
||||
.availability_zone
|
||||
.ok_or(anyhow!("missing availability_zone"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
wal_redo_timeout: self
|
||||
.wal_redo_timeout
|
||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
||||
page_cache_size: self
|
||||
.page_cache_size
|
||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
||||
max_file_descriptors: self
|
||||
.max_file_descriptors
|
||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
http_auth_type: self
|
||||
.http_auth_type
|
||||
.ok_or(anyhow!("missing http_auth_type"))?,
|
||||
pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
remote_storage_config: self
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: self
|
||||
.broker_endpoint
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_keepalive_interval: self
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
metric_collection_interval: self
|
||||
.metric_collection_interval
|
||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
||||
cached_metric_collection_interval: self
|
||||
.cached_metric_collection_interval
|
||||
.ok_or(anyhow!("missing cached_metric_collection_interval"))?,
|
||||
metric_collection_endpoint: self
|
||||
.metric_collection_endpoint
|
||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
||||
synthetic_size_calculation_interval: self
|
||||
.synthetic_size_calculation_interval
|
||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
||||
disk_usage_based_eviction: self
|
||||
.disk_usage_based_eviction
|
||||
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
ondemand_download_behavior_treat_error_as_warn: self
|
||||
.ondemand_download_behavior_treat_error_as_warn
|
||||
.ok_or(anyhow!(
|
||||
"missing ondemand_download_behavior_treat_error_as_warn"
|
||||
))?,
|
||||
background_task_maximum_delay: self
|
||||
.background_task_maximum_delay
|
||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
||||
control_plane_api: self
|
||||
.control_plane_api
|
||||
.ok_or(anyhow!("missing control_plane_api"))?,
|
||||
control_plane_api_token: self
|
||||
.control_plane_api_token
|
||||
.ok_or(anyhow!("missing control_plane_api_token"))?,
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
heatmap_upload_concurrency: self
|
||||
.heatmap_upload_concurrency
|
||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
||||
secondary_download_concurrency: self
|
||||
.secondary_download_concurrency
|
||||
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
get_vectored_impl: self
|
||||
.get_vectored_impl
|
||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
||||
max_vectored_read_bytes: self
|
||||
.max_vectored_read_bytes
|
||||
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
|
||||
validate_vectored_get: self
|
||||
.validate_vectored_get
|
||||
.ok_or(anyhow!("missing validate_vectored_get"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -761,10 +757,6 @@ impl PageServerConf {
|
||||
self.workdir.join("deletion")
|
||||
}
|
||||
|
||||
pub fn metadata_path(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("metadata.json")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
@@ -824,7 +816,18 @@ impl PageServerConf {
|
||||
.join(timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub(crate) fn timeline_delete_mark_file_path(
|
||||
pub fn timeline_uninit_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -835,10 +838,7 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_deleted_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
@@ -88,16 +88,13 @@
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
pub(crate) mod optional_counter;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Debug)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
page_content_kind: PageContentKind,
|
||||
pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
|
||||
}
|
||||
|
||||
/// The kind of access to the page cache.
|
||||
@@ -153,7 +150,6 @@ impl RequestContextBuilder {
|
||||
download_behavior: DownloadBehavior::Download,
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
page_content_kind: PageContentKind::Unknown,
|
||||
micros_spent_throttled: Default::default(),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -167,7 +163,6 @@ impl RequestContextBuilder {
|
||||
download_behavior: original.download_behavior,
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
page_content_kind: original.page_content_kind,
|
||||
micros_spent_throttled: Default::default(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
use std::{
|
||||
sync::atomic::{AtomicU32, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CounterU32 {
|
||||
inner: AtomicU32,
|
||||
}
|
||||
impl Default for CounterU32 {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: AtomicU32::new(u32::MAX),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl CounterU32 {
|
||||
pub fn open(&self) -> Result<(), &'static str> {
|
||||
match self
|
||||
.inner
|
||||
.compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
|
||||
{
|
||||
Ok(_) => Ok(()),
|
||||
Err(_) => Err("open() called on clsoed state"),
|
||||
}
|
||||
}
|
||||
pub fn close(&self) -> Result<u32, &'static str> {
|
||||
match self.inner.swap(u32::MAX, Ordering::Relaxed) {
|
||||
u32::MAX => Err("close() called on closed state"),
|
||||
x => Ok(x),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(&self, count: u32) -> Result<(), &'static str> {
|
||||
if count == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
let mut had_err = None;
|
||||
self.inner
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
|
||||
u32::MAX => {
|
||||
had_err = Some("add() called on closed state");
|
||||
None
|
||||
}
|
||||
x => {
|
||||
let (new, overflowed) = x.overflowing_add(count);
|
||||
if new == u32::MAX || overflowed {
|
||||
had_err = Some("add() overflowed the counter");
|
||||
None
|
||||
} else {
|
||||
Some(new)
|
||||
}
|
||||
}
|
||||
})
|
||||
.map_err(|_| had_err.expect("we set it whenever the function returns None"))
|
||||
.map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
pub struct MicroSecondsCounterU32 {
|
||||
inner: CounterU32,
|
||||
}
|
||||
|
||||
impl MicroSecondsCounterU32 {
|
||||
pub fn open(&self) -> Result<(), &'static str> {
|
||||
self.inner.open()
|
||||
}
|
||||
pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
|
||||
match duration.as_micros().try_into() {
|
||||
Ok(x) => self.inner.add(x),
|
||||
Err(_) => Err("add(): duration conversion error"),
|
||||
}
|
||||
}
|
||||
pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
|
||||
let val = self.inner.close()?;
|
||||
let val = Duration::from_micros(val as u64);
|
||||
let subbed = match from.checked_sub(val) {
|
||||
Some(v) => v,
|
||||
None => return Err("Duration::checked_sub"),
|
||||
};
|
||||
Ok(subbed)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_basic() {
|
||||
let counter = MicroSecondsCounterU32::default();
|
||||
counter.open().unwrap();
|
||||
counter.add(Duration::from_micros(23)).unwrap();
|
||||
let res = counter
|
||||
.close_and_checked_sub_from(Duration::from_micros(42))
|
||||
.unwrap();
|
||||
assert_eq!(res, Duration::from_micros(42 - 23));
|
||||
}
|
||||
}
|
||||
@@ -2,7 +2,6 @@ use std::collections::HashMap;
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::{
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
@@ -13,10 +12,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{backoff, generation::Generation, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
config::{NodeMetadata, PageServerConf},
|
||||
virtual_file::on_fatal_io_error,
|
||||
};
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
@@ -36,7 +32,6 @@ pub enum RetryForeverError {
|
||||
pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
fn validate(
|
||||
&self,
|
||||
@@ -115,59 +110,13 @@ impl ControlPlaneClient {
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
|
||||
// Include registration content in the re-attach request if a metadata file is readable
|
||||
let metadata_path = conf.metadata_path();
|
||||
let register = match tokio::fs::read_to_string(&metadata_path).await {
|
||||
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
|
||||
Ok(m) => {
|
||||
// Since we run one time at startup, be generous in our logging and
|
||||
// dump all metadata.
|
||||
tracing::info!(
|
||||
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
|
||||
m.postgres_host,
|
||||
m.postgres_port,
|
||||
m.http_host,
|
||||
m.http_port,
|
||||
m.other
|
||||
);
|
||||
|
||||
Some(NodeRegisterRequest {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Unreadable metadata in {metadata_path}: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// This is legal: we may have been deployed with some external script
|
||||
// doing registration for us.
|
||||
tracing::info!("Metadata file not found at {metadata_path}");
|
||||
} else {
|
||||
on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
|
||||
}
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let request = ReAttachRequest {
|
||||
node_id: self.node_id,
|
||||
register,
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-re-attach");
|
||||
|
||||
@@ -831,10 +831,7 @@ mod test {
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
_conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
async fn validate(
|
||||
|
||||
@@ -567,9 +567,9 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
/v1/tenant/{tenant_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
@@ -932,75 +932,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/heatmap_upload:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
If the location is in an attached mode, upload the current state to the remote heatmap
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: wait_ms
|
||||
description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
post:
|
||||
description: |
|
||||
If the location is in secondary mode, download latest heatmap and layers
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"202":
|
||||
description: Download has started but not yet finished
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1383,11 +1314,10 @@ components:
|
||||
TenantLocationConfigRequest:
|
||||
type: object
|
||||
required:
|
||||
- mode
|
||||
- tenant_id
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
description: Not used, scheduled for removal.
|
||||
mode:
|
||||
type: string
|
||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
||||
@@ -1409,10 +1339,6 @@ components:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantShardLocation"
|
||||
stripe_size:
|
||||
description: If multiple shards are present, this field contains the sharding stripe size, else it is null.
|
||||
type: integer
|
||||
nullable: true
|
||||
TenantShardLocation:
|
||||
type: object
|
||||
required:
|
||||
@@ -1461,7 +1387,7 @@ components:
|
||||
trace_read_requests:
|
||||
type: boolean
|
||||
heatmap_period:
|
||||
type: string
|
||||
type: integer
|
||||
TenantConfigResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -1639,37 +1565,6 @@ components:
|
||||
Lower is better score for how good this pageserver would be for the next tenant.
|
||||
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
||||
|
||||
SecondaryProgress:
|
||||
type: object
|
||||
required:
|
||||
- heatmap_mtime
|
||||
- layers_downloaded
|
||||
- layers_total
|
||||
- bytes_downloaded
|
||||
- bytes_total
|
||||
properties:
|
||||
heatmap_mtime:
|
||||
type: string
|
||||
format: date-time
|
||||
description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
|
||||
layers_downloaded:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many layers from the latest layer heatmap are present on disk
|
||||
bytes_downloaded:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many bytes of layer content from the latest layer heatmap are present on disk
|
||||
layers_total:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many layers were in the latest layer heatmap
|
||||
bytes_total:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many bytes of layer content were in the latest layer heatmap
|
||||
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -14,7 +14,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::LocationConfig;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
@@ -535,9 +534,9 @@ async fn timeline_create_handler(
|
||||
)
|
||||
}
|
||||
Err(
|
||||
e @ tenant::CreateTimelineError::Conflict
|
||||
| e @ tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
|
||||
tenant::CreateTimelineError::Conflict
|
||||
| tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, ()),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
@@ -885,16 +884,14 @@ async fn tenant_detach_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
state
|
||||
.tenant_manager
|
||||
.detach_tenant(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
detach_ignored.unwrap_or(false),
|
||||
&state.deletion_queue_client,
|
||||
)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
mgr::detach_tenant(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
detach_ignored.unwrap_or(false),
|
||||
&state.deletion_queue_client,
|
||||
)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1153,12 +1150,7 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(
|
||||
tenant_shard_id,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
)
|
||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1405,9 +1397,7 @@ async fn update_tenant_config_handler(
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.tenant_manager
|
||||
.set_new_tenant_config(tenant_conf, tenant_id)
|
||||
mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", %tenant_id))
|
||||
.await?;
|
||||
|
||||
@@ -1432,14 +1422,13 @@ async fn put_tenant_location_config_handler(
|
||||
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
||||
// its local disk content and drops it from memory.
|
||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||
if let Err(e) = state
|
||||
.tenant_manager
|
||||
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
))
|
||||
.await
|
||||
if let Err(e) =
|
||||
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
))
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
||||
@@ -1462,12 +1451,11 @@ async fn put_tenant_location_config_handler(
|
||||
tenant::SpawnMode::Eager
|
||||
};
|
||||
|
||||
let tenant = state
|
||||
let attached = state
|
||||
.tenant_manager
|
||||
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
|
||||
.await?;
|
||||
let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size());
|
||||
let attached = tenant.is_some();
|
||||
.await?
|
||||
.is_some();
|
||||
|
||||
if let Some(_flush_ms) = flush {
|
||||
match state
|
||||
@@ -1489,20 +1477,12 @@ async fn put_tenant_location_config_handler(
|
||||
// This API returns a vector of pageservers where the tenant is attached: this is
|
||||
// primarily for use in the sharding service. For compatibilty, we also return this
|
||||
// when called directly on a pageserver, but the payload is always zero or one shards.
|
||||
let mut response = TenantLocationConfigResponse {
|
||||
shards: Vec::new(),
|
||||
stripe_size: None,
|
||||
};
|
||||
let mut response = TenantLocationConfigResponse { shards: Vec::new() };
|
||||
if attached {
|
||||
response.shards.push(TenantShardLocation {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: state.conf.id,
|
||||
});
|
||||
if tenant_shard_id.shard_count.count() > 1 {
|
||||
// Stripe size should be set if we are attached
|
||||
debug_assert!(stripe_size.is_some());
|
||||
response.stripe_size = stripe_size;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
@@ -1530,29 +1510,6 @@ async fn list_location_config_handler(
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
async fn get_location_config_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let slot = state.tenant_manager.get(tenant_shard_id);
|
||||
|
||||
let Some(slot) = slot else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant shard not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
let result: Option<LocationConfig> = match slot {
|
||||
TenantSlot::Attached(t) => Some(t.get_location_conf()),
|
||||
TenantSlot::Secondary(s) => Some(s.get_location_conf()),
|
||||
TenantSlot::InProgress(_) => None,
|
||||
};
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached
|
||||
// (from all pageservers) as it invalidates consistency assumptions.
|
||||
async fn tenant_time_travel_remote_storage_handler(
|
||||
@@ -1653,7 +1610,8 @@ async fn timeline_gc_handler(
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
|
||||
let wait_task_done =
|
||||
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
.context("wait for gc task")
|
||||
@@ -1991,42 +1949,13 @@ async fn secondary_download_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_shard_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// We don't need this to issue the download request, but:
|
||||
// - it enables us to cleanly return 404 if we get a request for an absent shard
|
||||
// - we will use this to provide status feedback in the response
|
||||
let Some(secondary_tenant) = state
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(tenant_shard_id)
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
|
||||
));
|
||||
};
|
||||
|
||||
let timeout = wait.unwrap_or(Duration::MAX);
|
||||
|
||||
let status = match tokio::time::timeout(
|
||||
timeout,
|
||||
state.secondary_controller.download_tenant(tenant_shard_id),
|
||||
)
|
||||
.await
|
||||
{
|
||||
// Download job ran to completion.
|
||||
Ok(Ok(())) => StatusCode::OK,
|
||||
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||
// was detached between our check above and executing the download job.
|
||||
Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
|
||||
// A timeout is not an error: we have started the download, we're just not done
|
||||
// yet. The caller will get a response body indicating status.
|
||||
Err(_) => StatusCode::ACCEPTED,
|
||||
};
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
json_response(status, progress)
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -2086,10 +2015,6 @@ async fn get_utilization(
|
||||
r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
fail::fail_point!("get-utilization-http-handler", |_| {
|
||||
Err(ApiError::ResourceUnavailable("failpoint".into()))
|
||||
});
|
||||
|
||||
// this probably could be completely public, but lets make that change later.
|
||||
check_permission(&r, None)?;
|
||||
|
||||
@@ -2145,16 +2070,6 @@ where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
|
||||
{
|
||||
if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
|
||||
fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
|
||||
"failpoint".into()
|
||||
)));
|
||||
|
||||
fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
|
||||
anyhow::anyhow!("failpoint")
|
||||
)));
|
||||
}
|
||||
|
||||
// Spawn a new task to handle the request, to protect the handler from unexpected
|
||||
// async cancellations. Most pageserver functions are not async cancellation safe.
|
||||
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
|
||||
@@ -2299,9 +2214,6 @@ pub fn make_router(
|
||||
.get("/v1/location_config", |r| {
|
||||
api_handler(r, list_location_config_handler)
|
||||
})
|
||||
.get("/v1/location_config/:tenant_shard_id", |r| {
|
||||
api_handler(r, get_location_config_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/time_travel_remote_storage",
|
||||
|r| api_handler(r, tenant_time_travel_remote_storage_handler),
|
||||
|
||||
@@ -2,20 +2,28 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a neon Timeline.
|
||||
//!
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use nix::NixPath;
|
||||
use tokio::fs::{File, OpenOptions};
|
||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_tar::Archive;
|
||||
use tokio_tar::Builder;
|
||||
use tokio_tar::HeaderMode;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
@@ -625,3 +633,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
||||
reader.read_to_end(&mut buf).await?;
|
||||
Ok(Bytes::from(buf))
|
||||
}
|
||||
|
||||
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
|
||||
let file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&tmp_path)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {tmp_path}"))?;
|
||||
|
||||
let mut paths = Vec::new();
|
||||
for entry in WalkDir::new(pgdata_path) {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
// Also allow directories so that we also get empty directories
|
||||
if !(metadata.is_file() || metadata.is_dir()) {
|
||||
continue;
|
||||
}
|
||||
let path = entry.into_path();
|
||||
paths.push(path);
|
||||
}
|
||||
// Do a sort to get a more consistent listing
|
||||
paths.sort_unstable();
|
||||
let zstd = ZstdEncoder::with_quality_and_params(
|
||||
file,
|
||||
Level::Default,
|
||||
&[CParameter::enable_long_distance_matching(true)],
|
||||
);
|
||||
let mut builder = Builder::new(zstd);
|
||||
// Use reproducible header mode
|
||||
builder.mode(HeaderMode::Deterministic);
|
||||
for path in paths {
|
||||
let rel_path = path.strip_prefix(pgdata_path)?;
|
||||
if rel_path.is_empty() {
|
||||
// The top directory should not be compressed,
|
||||
// the tar crate doesn't like that
|
||||
continue;
|
||||
}
|
||||
builder.append_path_with_name(&path, rel_path).await?;
|
||||
}
|
||||
let mut zstd = builder.into_inner().await?;
|
||||
zstd.shutdown().await?;
|
||||
let mut compressed = zstd.into_inner();
|
||||
let compressed_len = compressed.metadata().await?.len();
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
||||
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
||||
}
|
||||
compressed.seek(SeekFrom::Start(0)).await?;
|
||||
Ok((compressed, compressed_len))
|
||||
}
|
||||
|
||||
pub async fn extract_tar_zst(
|
||||
pgdata_path: &Utf8Path,
|
||||
tar_zst: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
||||
let mut archive = Archive::new(tar);
|
||||
archive.unpack(pgdata_path).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -31,7 +31,6 @@ pub mod walredo;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::mgr::TenantManager;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
@@ -54,11 +53,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||
pub async fn shutdown_pageserver(
|
||||
tenant_manager: &TenantManager,
|
||||
deletion_queue: Option<DeletionQueue>,
|
||||
exit_code: i32,
|
||||
) {
|
||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
@@ -72,7 +67,7 @@ pub async fn shutdown_pageserver(
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
timed(
|
||||
tenant_manager.shutdown(),
|
||||
tenant::mgr::shutdown_all_tenants(),
|
||||
"shutdown all tenants",
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
@@ -119,27 +114,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
/// tenant path while in secondary mode.
|
||||
pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
|
||||
/// A suffix used for various temporary files. Any temporary files found in the
|
||||
/// data directory at pageserver startup can be automatically removed.
|
||||
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
/// A marker file to mark that a timeline directory was not fully initialized.
|
||||
/// If a timeline directory with this marker is encountered at pageserver startup,
|
||||
/// the timeline directory and the marker file are both removed.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
@@ -166,11 +161,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
||||
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
||||
// from the name.
|
||||
|
||||
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
pub fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use enum_map::EnumMap;
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
@@ -10,7 +11,6 @@ use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use tracing::warn;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
/// Prometheus histogram buckets (in seconds) for operations in the critical
|
||||
@@ -167,7 +167,7 @@ impl GetVectoredLatency {
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
"Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
|
||||
"Time spent in get_vectored",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
@@ -1005,39 +1005,15 @@ impl GlobalAndPerTimelineHistogram {
|
||||
}
|
||||
}
|
||||
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
struct GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
h: &'a GlobalAndPerTimelineHistogram,
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
op: SmgrQueryType,
|
||||
}
|
||||
|
||||
impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(res) => res,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
|
||||
RateLimit::new(Duration::from_secs(10))
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[self.op];
|
||||
rate_limit.call(|| {
|
||||
warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
self.h.observe(ex_throttled.as_secs_f64());
|
||||
self.h.observe(elapsed.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1049,7 +1025,6 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
|
||||
strum_macros::EnumCount,
|
||||
strum_macros::EnumIter,
|
||||
strum_macros::FromRepr,
|
||||
enum_map::Enum,
|
||||
)]
|
||||
#[strum(serialize_all = "snake_case")]
|
||||
pub enum SmgrQueryType {
|
||||
@@ -1155,35 +1130,11 @@ impl SmgrQueryTimePerTimeline {
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
pub(crate) fn start_timer<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
op: SmgrQueryType,
|
||||
ctx: &'c RequestContext,
|
||||
) -> impl Drop + '_ {
|
||||
pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
|
||||
let metric = &self.metrics[op as usize];
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
|
||||
Lazy::new(|| {
|
||||
Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
|
||||
RateLimit::new(Duration::from_secs(10))
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[op];
|
||||
rate_limit.call(|| {
|
||||
warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
GlobalAndPerTimelineHistogramTimer {
|
||||
h: metric,
|
||||
ctx,
|
||||
start,
|
||||
op,
|
||||
start: std::time::Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1194,11 +1145,6 @@ mod smgr_query_time_tests {
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
task_mgr::TaskKind,
|
||||
};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
@@ -1247,8 +1193,7 @@ mod smgr_query_time_tests {
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
let timer = metrics.start_timer(*op, &ctx);
|
||||
let timer = metrics.start_timer(*op);
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
@@ -1282,65 +1227,11 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
})
|
||||
});
|
||||
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
parent: &'a BasebackupQueryTime,
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
BasebackupQueryTimeOngoingRecording {
|
||||
parent: self,
|
||||
ctx,
|
||||
start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(ex_throttled) => ex_throttled,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
impl DurationResultObserver for BasebackupQueryTime {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self
|
||||
.parent
|
||||
.0
|
||||
.get_metric_with_label_values(&[label_value])
|
||||
.unwrap();
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
||||
metric.observe(duration.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2017,8 +1908,10 @@ impl TimelineMetrics {
|
||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn shutdown(&self) {
|
||||
impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
@@ -2465,8 +2358,7 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub mod tokio_epoll_uring {
|
||||
use metrics::{register_int_counter, UIntGauge};
|
||||
use once_cell::sync::Lazy;
|
||||
use metrics::UIntGauge;
|
||||
|
||||
pub struct Collector {
|
||||
descs: Vec<metrics::core::Desc>,
|
||||
@@ -2474,13 +2366,15 @@ pub mod tokio_epoll_uring {
|
||||
systems_destroyed: UIntGauge,
|
||||
}
|
||||
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
impl metrics::core::Collector for Collector {
|
||||
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
||||
self.descs.iter().collect()
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||
let mut mfs = Vec::with_capacity(Self::NMETRICS);
|
||||
let mut mfs = Vec::with_capacity(NMETRICS);
|
||||
let tokio_epoll_uring::metrics::Metrics {
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
@@ -2494,8 +2388,6 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
|
||||
impl Collector {
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
#[allow(clippy::new_without_default)]
|
||||
pub fn new() -> Self {
|
||||
let mut descs = Vec::new();
|
||||
@@ -2529,22 +2421,6 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
|
||||
"Number of times where thread_local_system creation spanned multiple executor threads",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
|
||||
"Number of times thread_local_system creation failed and was retried after back-off.",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) mod tenant_throttling {
|
||||
@@ -2673,8 +2549,6 @@ pub fn preinitialize_metrics() {
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
@@ -2693,12 +2567,6 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||
Lazy::force(&disk_usage_based_eviction::METRICS);
|
||||
|
||||
for state_name in pageserver_api::models::TenantState::VARIANTS {
|
||||
// initialize the metric for all gauges, otherwise the time series might seemingly show
|
||||
// values from last restart.
|
||||
TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
|
||||
}
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
|
||||
@@ -910,7 +910,7 @@ impl PageServerHandler {
|
||||
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetRelExists);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
@@ -938,7 +938,7 @@ impl PageServerHandler {
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetRelSize);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
@@ -966,7 +966,7 @@ impl PageServerHandler {
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetDbSize);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
@@ -1144,7 +1144,7 @@ impl PageServerHandler {
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
@@ -1172,7 +1172,7 @@ impl PageServerHandler {
|
||||
|
||||
let _timer = timeline
|
||||
.query_metrics
|
||||
.start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
|
||||
.start_timer(metrics::SmgrQueryType::GetSlruSegment);
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
@@ -1199,7 +1199,7 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: &RequestContext,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1214,7 +1214,7 @@ impl PageServerHandler {
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline.wait_lsn(lsn, &ctx).await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
@@ -1236,7 +1236,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
@@ -1257,7 +1257,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
@@ -1269,7 +1269,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1449,25 +1449,25 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -1563,7 +1563,7 @@ where
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -15,7 +15,6 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -34,7 +33,6 @@ use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
@@ -1500,7 +1498,7 @@ impl<'a> DatadirModification<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut writer = self.tline.writer().await;
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
@@ -1539,23 +1537,14 @@ impl<'a> DatadirModification<'a> {
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||
self.pending_updates
|
||||
.drain()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||
VecMapOrdering::GreaterOrEqual,
|
||||
);
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
@@ -1688,7 +1677,7 @@ struct RelDirectory {
|
||||
rels: HashSet<(Oid, u8)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
pub(crate) struct AuxFilesDirectory {
|
||||
pub(crate) files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
@@ -50,6 +50,8 @@ use once_cell::sync::Lazy;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
|
||||
//
|
||||
// There are four runtimes:
|
||||
//
|
||||
@@ -270,6 +272,9 @@ pub enum TaskKind {
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
@@ -451,7 +456,7 @@ async fn task_finish(
|
||||
}
|
||||
|
||||
if shutdown_process {
|
||||
std::process::exit(1);
|
||||
shutdown_pageserver(None, 1).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,7 +22,6 @@ use pageserver_api::models;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -43,8 +42,6 @@ use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::timeout::TimeoutCancellableError;
|
||||
use utils::zstd::create_zst_tarball;
|
||||
use utils::zstd::extract_zst_tarball;
|
||||
|
||||
use self::config::AttachedLocationConfig;
|
||||
use self::config::AttachmentMode;
|
||||
@@ -57,8 +54,8 @@ use self::mgr::GetTenantError;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::upload::upload_index_part;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineCreateGuard;
|
||||
use self::timeline::uninit::TimelineExclusionError;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
@@ -567,8 +564,9 @@ impl Tenant {
|
||||
// avoiding holding it across awaits
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
match timelines_accessor.entry(timeline_id) {
|
||||
// We should never try and load the same timeline twice during startup
|
||||
Entry::Occupied(_) => {
|
||||
// The uninit mark file acts as a lock that prevents another task from
|
||||
// initializing the timeline at the same time.
|
||||
unreachable!(
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||
);
|
||||
@@ -1065,7 +1063,8 @@ impl Tenant {
|
||||
let entry_path = entry.path();
|
||||
|
||||
let purge = if crate::is_temporary(entry_path)
|
||||
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
|
||||
// TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
|
||||
// covered by the check that the timeline must exist in remote storage.
|
||||
|| is_uninit_mark(entry_path)
|
||||
|| crate::is_delete_mark(entry_path)
|
||||
{
|
||||
@@ -1298,6 +1297,11 @@ impl Tenant {
|
||||
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
|
||||
/// and the timeline will fail to load at a restart.
|
||||
///
|
||||
/// That's why we add an uninit mark file, and wrap it together witht the Timeline
|
||||
/// in-memory object into UninitializedTimeline.
|
||||
/// Once the caller is done setting up the timeline, they should call
|
||||
/// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
|
||||
///
|
||||
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||
/// minimum amount of keys required to get a writable timeline.
|
||||
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||
@@ -1313,9 +1317,7 @@ impl Tenant {
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
// Protect against concurrent attempts to use this TimelineId
|
||||
let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
|
||||
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||
// make it valid, before calling finish_creation()
|
||||
@@ -1330,7 +1332,7 @@ impl Tenant {
|
||||
self.prepare_new_timeline(
|
||||
new_timeline_id,
|
||||
&new_metadata,
|
||||
create_guard,
|
||||
timeline_uninit_mark,
|
||||
initdb_lsn,
|
||||
None,
|
||||
)
|
||||
@@ -1418,8 +1420,9 @@ impl Tenant {
|
||||
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||
|
||||
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
|
||||
// and that no other creation attempts will be allowed in while we are working.
|
||||
let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
|
||||
// and that no other creation attempts will be allowed in while we are working. The
|
||||
// uninit_mark is a guard.
|
||||
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
|
||||
Ok(m) => m,
|
||||
Err(TimelineExclusionError::AlreadyCreating) => {
|
||||
// Creation is in progress, we cannot create it again, and we cannot
|
||||
@@ -1462,8 +1465,6 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
pausable_failpoint!("timeline-creation-after-uninit");
|
||||
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = self
|
||||
@@ -1511,7 +1512,7 @@ impl Tenant {
|
||||
&ancestor_timeline,
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
create_guard,
|
||||
uninit_mark,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -1521,7 +1522,7 @@ impl Tenant {
|
||||
new_timeline_id,
|
||||
pg_version,
|
||||
load_existing_initdb,
|
||||
create_guard,
|
||||
uninit_mark,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -1844,8 +1845,6 @@ impl Tenant {
|
||||
// Wait for any in-flight operations to complete
|
||||
self.gate.close().await;
|
||||
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2087,10 +2086,6 @@ impl Tenant {
|
||||
&self.tenant_shard_id
|
||||
}
|
||||
|
||||
pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
|
||||
self.shard_identity.stripe_size
|
||||
}
|
||||
|
||||
pub(crate) fn get_generation(&self) -> Generation {
|
||||
self.generation
|
||||
}
|
||||
@@ -2868,9 +2863,9 @@ impl Tenant {
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
|
||||
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
|
||||
let tl = self
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
|
||||
.await?;
|
||||
tl.set_state(TimelineState::Active);
|
||||
Ok(tl)
|
||||
@@ -2884,10 +2879,10 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -2896,7 +2891,7 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
@@ -2980,7 +2975,7 @@ impl Tenant {
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_create_guard,
|
||||
timeline_uninit_mark,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)
|
||||
@@ -3012,12 +3007,12 @@ impl Tenant {
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
|
||||
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
|
||||
self.bootstrap_timeline(
|
||||
timeline_id,
|
||||
pg_version,
|
||||
load_existing_initdb,
|
||||
create_guard,
|
||||
uninit_mark,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3044,13 +3039,8 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
|
||||
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
|
||||
warn!(
|
||||
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
|
||||
);
|
||||
}
|
||||
let (pgdata_zstd, tar_zst_size) =
|
||||
import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
|
||||
|
||||
pausable_failpoint!("before-initdb-upload");
|
||||
|
||||
@@ -3086,7 +3076,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
@@ -3098,14 +3088,13 @@ impl Tenant {
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
// Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
|
||||
// we won't race with other creations or existent timelines with the same path.
|
||||
// an uninit mark was placed before, nothing else can access this timeline files
|
||||
// current initdb was not run yet, so remove whatever was left from the previous runs
|
||||
if pgdata_path.exists() {
|
||||
fs::remove_dir_all(&pgdata_path).with_context(|| {
|
||||
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
||||
})?;
|
||||
}
|
||||
|
||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||
scopeguard::defer! {
|
||||
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||
@@ -3150,7 +3139,7 @@ impl Tenant {
|
||||
|
||||
let buf_read =
|
||||
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
|
||||
extract_zst_tarball(&pgdata_path, buf_read)
|
||||
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
|
||||
.await
|
||||
.context("extract initdb tar")?;
|
||||
} else {
|
||||
@@ -3182,7 +3171,7 @@ impl Tenant {
|
||||
.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_create_guard,
|
||||
timeline_uninit_mark,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)
|
||||
@@ -3254,12 +3243,13 @@ impl Tenant {
|
||||
///
|
||||
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||
/// `finish_creation` to insert the Timeline into the timelines map.
|
||||
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
|
||||
/// uninit mark file.
|
||||
async fn prepare_new_timeline<'a>(
|
||||
&'a self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
create_guard: TimelineCreateGuard<'a>,
|
||||
uninit_mark: TimelineUninitMark<'a>,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
@@ -3282,12 +3272,9 @@ impl Tenant {
|
||||
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
|
||||
if let Err(e) = self
|
||||
.create_timeline_files(&create_guard.timeline_path)
|
||||
.await
|
||||
{
|
||||
if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
|
||||
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||
cleanup_timeline_directory(create_guard);
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -3298,31 +3285,41 @@ impl Tenant {
|
||||
Ok(UninitializedTimeline::new(
|
||||
self,
|
||||
new_timeline_id,
|
||||
Some((timeline_struct, create_guard)),
|
||||
Some((timeline_struct, uninit_mark)),
|
||||
))
|
||||
}
|
||||
|
||||
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
fail::fail_point!("after-timeline-dir-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-dir-creation");
|
||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get a guard that provides exclusive access to the timeline directory, preventing
|
||||
/// concurrent attempts to create the same timeline.
|
||||
fn create_timeline_create_guard(
|
||||
/// Attempts to create an uninit mark file for the timeline initialization.
|
||||
/// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
|
||||
///
|
||||
/// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
|
||||
fn create_timeline_uninit_mark(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
|
||||
) -> Result<TimelineUninitMark, TimelineExclusionError> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let uninit_mark_path = self
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
|
||||
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
|
||||
let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
|
||||
let uninit_mark = TimelineUninitMark::new(
|
||||
self,
|
||||
timeline_id,
|
||||
uninit_mark_path.clone(),
|
||||
timeline_path.clone(),
|
||||
)?;
|
||||
|
||||
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
||||
// for creation.
|
||||
@@ -3338,7 +3335,23 @@ impl Tenant {
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(create_guard)
|
||||
// Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
|
||||
// that during process runtime, colliding creations will be caught in-memory without getting
|
||||
// as far as failing to write a file.
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(&uninit_mark_path)
|
||||
.context("Failed to create uninit mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&uninit_mark_path)
|
||||
.context("Failed to fsync uninit mark file")
|
||||
})
|
||||
.with_context(|| {
|
||||
format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
Ok(uninit_mark)
|
||||
}
|
||||
|
||||
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
||||
@@ -3539,6 +3552,11 @@ async fn run_initdb(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Drop for Tenant {
|
||||
fn drop(&mut self) {
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
/// Dump contents of a layer file to stdout.
|
||||
pub async fn dump_layerfile_from_path(
|
||||
path: &Utf8Path,
|
||||
@@ -3656,10 +3674,7 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
impl TenantHarness {
|
||||
pub fn create_custom(
|
||||
test_name: &'static str,
|
||||
tenant_conf: TenantConf,
|
||||
) -> anyhow::Result<Self> {
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
setup_logging();
|
||||
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
@@ -3671,6 +3686,14 @@ pub(crate) mod harness {
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Disable automatic GC and compaction to make the unit tests more deterministic.
|
||||
// The tests perform them manually if needed.
|
||||
let tenant_conf = TenantConf {
|
||||
gc_period: Duration::ZERO,
|
||||
compaction_period: Duration::ZERO,
|
||||
..TenantConf::default()
|
||||
};
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
|
||||
@@ -3698,18 +3721,6 @@ pub(crate) mod harness {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
// Disable automatic GC and compaction to make the unit tests more deterministic.
|
||||
// The tests perform them manually if needed.
|
||||
let tenant_conf = TenantConf {
|
||||
gc_period: Duration::ZERO,
|
||||
compaction_period: Duration::ZERO,
|
||||
..TenantConf::default()
|
||||
};
|
||||
|
||||
Self::create_custom(test_name, tenant_conf)
|
||||
}
|
||||
|
||||
pub fn span(&self) -> tracing::Span {
|
||||
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
|
||||
}
|
||||
@@ -3817,7 +3828,6 @@ mod tests {
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
@@ -3834,7 +3844,7 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -3846,7 +3856,7 @@ mod tests {
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -3912,7 +3922,7 @@ mod tests {
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
|
||||
@@ -3946,7 +3956,7 @@ mod tests {
|
||||
let newtline = tenant
|
||||
.get_timeline(NEW_TIMELINE_ID, true)
|
||||
.expect("Should have a local timeline");
|
||||
let mut new_writer = newtline.writer().await;
|
||||
let new_writer = newtline.writer().await;
|
||||
new_writer
|
||||
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
|
||||
.await?;
|
||||
@@ -3978,7 +3988,7 @@ mod tests {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
// Create a relation on the timeline
|
||||
writer
|
||||
.put(
|
||||
@@ -4003,7 +4013,7 @@ mod tests {
|
||||
}
|
||||
tline.freeze_and_flush().await?;
|
||||
{
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4366,7 +4376,7 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4383,7 +4393,7 @@ mod tests {
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4400,7 +4410,7 @@ mod tests {
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4417,7 +4427,7 @@ mod tests {
|
||||
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
*TEST_KEY,
|
||||
@@ -4474,7 +4484,7 @@ mod tests {
|
||||
for _ in 0..repeat {
|
||||
for _ in 0..key_count {
|
||||
test_key.field6 = blknum;
|
||||
let mut writer = timeline.writer().await;
|
||||
let writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4605,7 +4615,10 @@ mod tests {
|
||||
drop(guard);
|
||||
|
||||
// Pick a big LSN such that we query over all the changes.
|
||||
let reads_lsn = Lsn(u64::MAX - 1);
|
||||
// Technically, u64::MAX - 1 is the largest LSN supported by the read path,
|
||||
// but there seems to be a bug on the non-vectored search path which surfaces
|
||||
// in that case.
|
||||
let reads_lsn = Lsn(u64::MAX - 1000);
|
||||
|
||||
for read in reads {
|
||||
info!("Doing vectored read on {:?}", read);
|
||||
@@ -4619,145 +4632,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test that vectored get handles layer gaps correctly
|
||||
// by advancing into the next ancestor timeline if required.
|
||||
//
|
||||
// The test generates timelines that look like the diagram below.
|
||||
// We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram).
|
||||
// The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram).
|
||||
//
|
||||
// ```
|
||||
//-------------------------------+
|
||||
// ... |
|
||||
// [ L1 ] |
|
||||
// [ / L1 ] | Child Timeline
|
||||
// ... |
|
||||
// ------------------------------+
|
||||
// [ X L1 ] | Parent Timeline
|
||||
// ------------------------------+
|
||||
// ```
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_key_gap() -> anyhow::Result<()> {
|
||||
let tenant_conf = TenantConf {
|
||||
// Make compaction deterministic
|
||||
gc_period: Duration::ZERO,
|
||||
compaction_period: Duration::ZERO,
|
||||
// Encourage creation of L1 layers
|
||||
checkpoint_distance: 16 * 1024,
|
||||
compaction_target_size: 8 * 1024,
|
||||
..TenantConf::default()
|
||||
};
|
||||
|
||||
let harness = TenantHarness::create_custom("test_get_vectored_key_gap", tenant_conf)?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let gap_at_key = current_key.add(100);
|
||||
let mut current_lsn = Lsn(0x10);
|
||||
|
||||
const KEY_COUNT: usize = 10_000;
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let current_timeline = tenant
|
||||
.create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
current_lsn += 0x100;
|
||||
|
||||
let mut writer = current_timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
gap_at_key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(current_lsn);
|
||||
drop(writer);
|
||||
|
||||
let mut latest_lsns = HashMap::new();
|
||||
latest_lsns.insert(gap_at_key, current_lsn);
|
||||
|
||||
current_timeline.freeze_and_flush().await?;
|
||||
|
||||
let child_timeline_id = TimelineId::generate();
|
||||
|
||||
tenant
|
||||
.branch_timeline_test(
|
||||
¤t_timeline,
|
||||
child_timeline_id,
|
||||
Some(current_lsn),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
let child_timeline = tenant
|
||||
.get_timeline(child_timeline_id, true)
|
||||
.expect("Should have the branched timeline");
|
||||
|
||||
for i in 0..KEY_COUNT {
|
||||
if current_key == gap_at_key {
|
||||
current_key = current_key.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
current_lsn += 0x10;
|
||||
|
||||
let mut writer = child_timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
current_key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(current_lsn);
|
||||
drop(writer);
|
||||
|
||||
latest_lsns.insert(current_key, current_lsn);
|
||||
current_key = current_key.next();
|
||||
|
||||
// Flush every now and then to encourage layer file creation.
|
||||
if i % 500 == 0 {
|
||||
child_timeline.freeze_and_flush().await?;
|
||||
}
|
||||
}
|
||||
|
||||
child_timeline.freeze_and_flush().await?;
|
||||
let mut flags = EnumSet::new();
|
||||
flags.insert(CompactFlags::ForceRepartition);
|
||||
child_timeline
|
||||
.compact(&CancellationToken::new(), flags, &ctx)
|
||||
.await?;
|
||||
|
||||
let key_near_end = {
|
||||
let mut tmp = current_key;
|
||||
tmp.field6 -= 10;
|
||||
tmp
|
||||
};
|
||||
|
||||
let key_near_gap = {
|
||||
let mut tmp = gap_at_key;
|
||||
tmp.field6 -= 10;
|
||||
tmp
|
||||
};
|
||||
|
||||
let read = KeySpace {
|
||||
ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
|
||||
};
|
||||
let results = child_timeline
|
||||
.get_vectored_impl(read.clone(), current_lsn, &ctx)
|
||||
.await?;
|
||||
|
||||
for (key, img_res) in results {
|
||||
let expected = test_img(&format!("{} at {}", key, latest_lsns[&key]));
|
||||
assert_eq!(img_res?, expected);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_random_updates")?;
|
||||
@@ -4781,7 +4655,7 @@ mod tests {
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4802,7 +4676,7 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4870,7 +4744,7 @@ mod tests {
|
||||
for blknum in 0..NUM_KEYS {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4899,7 +4773,7 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -4976,7 +4850,7 @@ mod tests {
|
||||
lsn = Lsn(lsn.0 + 0x10);
|
||||
let blknum = thread_rng().gen_range(0..NUM_KEYS);
|
||||
test_key.field6 = blknum as u32;
|
||||
let mut writer = tline.writer().await;
|
||||
let writer = tline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
test_key,
|
||||
@@ -5079,15 +4953,15 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_guard_crash() -> anyhow::Result<()> {
|
||||
let name = "test_create_guard_crash";
|
||||
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
||||
let name = "test_uninit_mark_crash";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||
// Keeps uninit mark in place
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown()
|
||||
@@ -5115,24 +4989,10 @@ mod tests {
|
||||
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_read_at_max_lsn")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let read_lsn = Lsn(u64::MAX - 1);
|
||||
|
||||
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
|
||||
assert!(!harness
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//!
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
use tokio_epoll_uring::{BoundedBuf, Slice};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
@@ -127,7 +127,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
/// You need to make sure that the internal buffer is empty, otherwise
|
||||
/// data will be written in wrong order.
|
||||
#[inline(always)]
|
||||
async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
async fn write_all_unbuffered<B: BoundedBuf>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
@@ -162,10 +162,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
}
|
||||
|
||||
/// Internal, possibly buffered, write function
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
src_buf: B,
|
||||
) -> (B::Buf, Result<(), Error>) {
|
||||
async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
return self.write_all_unbuffered(src_buf).await;
|
||||
@@ -213,10 +210,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
srcbuf: B,
|
||||
) -> (B::Buf, Result<u64, Error>) {
|
||||
pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
|
||||
let offset = self.offset;
|
||||
|
||||
let len = srcbuf.bytes_init();
|
||||
|
||||
@@ -354,7 +354,6 @@ pub struct TenantConf {
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||
|
||||
@@ -296,7 +296,6 @@ impl DeleteTenantFlow {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -304,9 +303,7 @@ impl DeleteTenantFlow {
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) =
|
||||
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
|
||||
{
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
@@ -325,7 +322,6 @@ impl DeleteTenantFlow {
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
@@ -339,9 +335,15 @@ impl DeleteTenantFlow {
|
||||
// Though sounds scary, different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
create_remote_delete_mark(
|
||||
conf,
|
||||
remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
@@ -544,7 +546,8 @@ impl DeleteTenantFlow {
|
||||
conf,
|
||||
remote_storage.as_ref(),
|
||||
&tenant.tenant_shard_id,
|
||||
&task_mgr::shutdown_token(),
|
||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -18,19 +18,10 @@
|
||||
//! - An Iterator interface would be more convenient for the callers than the
|
||||
//! 'visit' function
|
||||
//!
|
||||
use async_stream::try_stream;
|
||||
use byteorder::{ReadBytesExt, BE};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use either::Either;
|
||||
use futures::Stream;
|
||||
use hex;
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
io,
|
||||
iter::Rev,
|
||||
ops::{Range, RangeInclusive},
|
||||
result,
|
||||
};
|
||||
use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
|
||||
@@ -259,90 +250,6 @@ where
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Return a stream which yields all key, value pairs from the index
|
||||
/// starting from the first key greater or equal to `start_key`.
|
||||
///
|
||||
/// Note that this is a copy of [`Self::visit`].
|
||||
/// TODO: Once the sequential read path is removed this will become
|
||||
/// the only index traversal method.
|
||||
pub fn get_stream_from<'a>(
|
||||
&'a self,
|
||||
start_key: &'a [u8; L],
|
||||
ctx: &'a RequestContext,
|
||||
) -> impl Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a {
|
||||
try_stream! {
|
||||
let mut stack = Vec::new();
|
||||
stack.push((self.root_blk, None));
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
while let Some((node_blknum, opt_iter)) = stack.pop() {
|
||||
// Locate the node.
|
||||
let node_buf = block_cursor
|
||||
.read_blk(self.start_blk + node_blknum, ctx)
|
||||
.await?;
|
||||
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
assert!(node.num_children > 0);
|
||||
|
||||
let mut keybuf = Vec::new();
|
||||
keybuf.extend(node.prefix);
|
||||
keybuf.resize(prefix_len + suffix_len, 0);
|
||||
|
||||
let mut iter: Either<Range<usize>, Rev<RangeInclusive<usize>>> = if let Some(iter) = opt_iter {
|
||||
iter
|
||||
} else {
|
||||
// Locate the first match
|
||||
let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) {
|
||||
Ok(idx) => idx,
|
||||
Err(idx) => {
|
||||
if node.level == 0 {
|
||||
// Imagine that the node contains the following keys:
|
||||
//
|
||||
// 1
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
//
|
||||
// If the search key is '2' and there is exact match,
|
||||
// the binary search would return the index of key
|
||||
// '3'. That's cool, '3' is the first key to return.
|
||||
idx
|
||||
} else {
|
||||
// This is an internal page, so each key represents a lower
|
||||
// bound for what's in the child page. If there is no exact
|
||||
// match, we have to return the *previous* entry.
|
||||
//
|
||||
// 1 <-- return this
|
||||
// 3 <-- idx
|
||||
// 5
|
||||
idx.saturating_sub(1)
|
||||
}
|
||||
}
|
||||
};
|
||||
Either::Left(idx..node.num_children.into())
|
||||
};
|
||||
|
||||
// idx points to the first match now. Keep going from there
|
||||
while let Some(idx) = iter.next() {
|
||||
let key_off = idx * suffix_len;
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx);
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if node.level == 0 {
|
||||
// leaf
|
||||
yield (keybuf.clone(), value.to_u64());
|
||||
} else {
|
||||
stack.push((node_blknum, Some(iter)));
|
||||
stack.push((value.to_blknum(), None));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan the tree, starting from 'search_key', in the given direction. 'visitor'
|
||||
/// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
|
||||
|
||||
@@ -460,22 +460,15 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> RangeSearchResult {
|
||||
let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
|
||||
Some(version) => version,
|
||||
None => {
|
||||
let mut result = RangeSearchResult::new();
|
||||
result.not_found.add_range(key_range);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
|
||||
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
|
||||
|
||||
let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
|
||||
let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
|
||||
let image_changes = version.image_coverage.range_overlaps(&raw_range);
|
||||
|
||||
let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
|
||||
collector.collect()
|
||||
Some(collector.collect())
|
||||
}
|
||||
|
||||
/// Start a batch of updates, applied on drop
|
||||
@@ -1002,13 +995,8 @@ mod tests {
|
||||
let layer_map = LayerMap::default();
|
||||
let range = Key::from_i128(100)..Key::from_i128(200);
|
||||
|
||||
let res = layer_map.range_search(range.clone(), Lsn(100));
|
||||
assert_eq!(
|
||||
res.not_found.to_keyspace(),
|
||||
KeySpace {
|
||||
ranges: vec![range]
|
||||
}
|
||||
);
|
||||
let res = layer_map.range_search(range, Lsn(100));
|
||||
assert!(res.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1045,7 +1033,7 @@ mod tests {
|
||||
for start in 0..60 {
|
||||
for end in (start + 1)..60 {
|
||||
let range = Key::from_i128(start)..Key::from_i128(end);
|
||||
let result = layer_map.range_search(range.clone(), Lsn(100));
|
||||
let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
|
||||
let expected = brute_force_range_search(&layer_map, range, Lsn(100));
|
||||
|
||||
assert_range_search_result_eq(result, expected);
|
||||
|
||||
@@ -2,12 +2,11 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -101,7 +100,7 @@ pub(crate) enum TenantsMap {
|
||||
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
|
||||
/// New tenants can be added using [`tenant_map_acquire_slot`].
|
||||
Open(BTreeMap<TenantShardId, TenantSlot>),
|
||||
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
|
||||
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
|
||||
/// Existing tenants are still accessible, but no new tenants can be created.
|
||||
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
|
||||
}
|
||||
@@ -260,12 +259,6 @@ pub struct TenantManager {
|
||||
// See https://github.com/neondatabase/neon/issues/5796
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
resources: TenantSharedResources,
|
||||
|
||||
// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
|
||||
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
|
||||
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
|
||||
// when the tenant detaches.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
fn emergency_generations(
|
||||
@@ -302,7 +295,7 @@ async fn init_load_generations(
|
||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
match client.re_attach().await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
@@ -625,14 +618,13 @@ pub async fn init_tenant_mgr(
|
||||
conf,
|
||||
tenants: &TENANTS,
|
||||
resources,
|
||||
cancel: CancellationToken::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
|
||||
/// a broken tenant in the map if Tenant::spawn fails.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn tenant_spawn(
|
||||
pub(crate) fn tenant_spawn(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
tenant_path: &Utf8Path,
|
||||
@@ -686,6 +678,21 @@ fn tenant_spawn(
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
||||
/// from the tenants map, the management API would return 404 for these tenants,
|
||||
/// because TenantsMap::get() now returns `None`.
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn shutdown_all_tenants() {
|
||||
shutdown_all_tenants0(&TENANTS).await
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
let mut join_set = JoinSet::new();
|
||||
|
||||
@@ -824,6 +831,40 @@ pub(crate) enum SetNewTenantConfigError {
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
pub(crate) async fn set_new_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), SetNewTenantConfigError> {
|
||||
// Legacy API: does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum UpsertLocationError {
|
||||
#[error("Bad config request: {0}")]
|
||||
@@ -1317,16 +1358,6 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option<TenantSlot> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
match &*locked {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => {
|
||||
map.get(&tenant_shard_id).cloned()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -1385,7 +1416,6 @@ impl TenantManager {
|
||||
self.resources.remote_storage.clone(),
|
||||
&TENANTS,
|
||||
tenant,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1399,41 +1429,11 @@ impl TenantManager {
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let r = self
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one. This is
|
||||
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
|
||||
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
|
||||
// setting it broken probably won't help either.
|
||||
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Validate the incoming request
|
||||
// Plan: identify what the new child shards will be
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
@@ -1442,18 +1442,10 @@ impl TenantManager {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
|
||||
if let Some(new_stripe_size) = new_stripe_size {
|
||||
if tenant.get_shard_stripe_size() != new_stripe_size
|
||||
&& tenant_shard_id.shard_count.count() > 1
|
||||
{
|
||||
// This tenant already has multiple shards, it is illegal to try and change its stripe size
|
||||
anyhow::bail!(
|
||||
"Shard stripe size may not be modified once tenant has multiple shards"
|
||||
);
|
||||
}
|
||||
}
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
let child_shards = tenant_shard_id.split(new_shard_count);
|
||||
tracing::info!(
|
||||
"Shard {} splits into: {}",
|
||||
@@ -1464,14 +1456,6 @@ impl TenantManager {
|
||||
.join(",")
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
|
||||
// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
|
||||
if let Err(e) = tenant.split_prepare(&child_shards).await {
|
||||
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||
@@ -1481,10 +1465,6 @@ impl TenantManager {
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
@@ -1506,16 +1486,11 @@ impl TenantManager {
|
||||
anyhow::bail!("Detached parent shard in the middle of split!")
|
||||
}
|
||||
};
|
||||
fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
@@ -1530,9 +1505,6 @@ impl TenantManager {
|
||||
// Phase 3: Spawn the child shards
|
||||
for child_shard in &child_shards {
|
||||
let mut child_shard_identity = parent_shard_identity;
|
||||
if let Some(new_stripe_size) = new_stripe_size {
|
||||
child_shard_identity.stripe_size = new_stripe_size;
|
||||
}
|
||||
child_shard_identity.count = child_shard.shard_count;
|
||||
child_shard_identity.number = child_shard.shard_number;
|
||||
|
||||
@@ -1555,10 +1527,6 @@ impl TenantManager {
|
||||
.await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Phase 4: wait for child chards WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
@@ -1591,10 +1559,6 @@ impl TenantManager {
|
||||
timeline.timeline_id,
|
||||
target_lsn
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
@@ -1626,11 +1590,19 @@ impl TenantManager {
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
self.spawn_background_purge(tmp_path);
|
||||
|
||||
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
None,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
@@ -1661,9 +1633,9 @@ impl TenantManager {
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
.resident_layers()
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
@@ -1763,151 +1735,6 @@ impl TenantManager {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
||||
/// from the tenants map, the management API would return 404 for these tenants,
|
||||
/// because TenantsMap::get() now returns `None`.
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
self.cancel.cancel();
|
||||
|
||||
shutdown_all_tenants0(self.tenants).await
|
||||
}
|
||||
|
||||
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
|
||||
/// the background, and thereby avoid blocking any API requests on this deletion completing.
|
||||
fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
|
||||
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
|
||||
let task_tenant_id = None;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
task_tenant_id,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
pub(crate) async fn detach_tenant(
|
||||
&self,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
detach_ignored: bool,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let tmp_path = self
|
||||
.detach_tenant0(
|
||||
conf,
|
||||
&TENANTS,
|
||||
tenant_shard_id,
|
||||
detach_ignored,
|
||||
deletion_queue_client,
|
||||
)
|
||||
.await?;
|
||||
self.spawn_background_purge(tmp_path);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn detach_tenant0(
|
||||
&self,
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &std::sync::RwLock<TenantsMap>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
detach_ignored: bool,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<Utf8PathBuf, TenantStateError> {
|
||||
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} rename")
|
||||
})
|
||||
};
|
||||
|
||||
let removal_result = remove_tenant_from_memory(
|
||||
tenants,
|
||||
tenant_shard_id,
|
||||
tenant_dir_rename_operation(tenant_shard_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Flush pending deletions, so that they have a good chance of passing validation
|
||||
// before this tenant is potentially re-attached elsewhere.
|
||||
deletion_queue_client.flush_advisory();
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
if detach_ignored
|
||||
&& matches!(
|
||||
removal_result,
|
||||
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
|
||||
)
|
||||
{
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
info!("Detaching an ignored tenant");
|
||||
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Ignored tenant {tenant_shard_id} local directory rename")
|
||||
})?;
|
||||
return Ok(tmp_path);
|
||||
}
|
||||
}
|
||||
|
||||
removal_result
|
||||
}
|
||||
|
||||
pub(crate) async fn set_new_tenant_config(
|
||||
&self,
|
||||
new_tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), SetNewTenantConfigError> {
|
||||
// Legacy API: does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
// This is a legacy API that only operates on attached tenants: the preferred
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf.clone(),
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &location_conf)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -2109,6 +1936,87 @@ pub(crate) enum TenantStateError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub(crate) async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
detach_ignored: bool,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let tmp_path = detach_tenant0(
|
||||
conf,
|
||||
&TENANTS,
|
||||
tenant_shard_id,
|
||||
detach_ignored,
|
||||
deletion_queue_client,
|
||||
)
|
||||
.await?;
|
||||
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
|
||||
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
|
||||
let task_tenant_id = None;
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
task_tenant_id,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn detach_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &std::sync::RwLock<TenantsMap>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
detach_ignored: bool,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<Utf8PathBuf, TenantStateError> {
|
||||
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
|
||||
};
|
||||
|
||||
let removal_result = remove_tenant_from_memory(
|
||||
tenants,
|
||||
tenant_shard_id,
|
||||
tenant_dir_rename_operation(tenant_shard_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Flush pending deletions, so that they have a good chance of passing validation
|
||||
// before this tenant is potentially re-attached elsewhere.
|
||||
deletion_queue_client.flush_advisory();
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
if detach_ignored
|
||||
&& matches!(
|
||||
removal_result,
|
||||
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
|
||||
)
|
||||
{
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
info!("Detaching an ignored tenant");
|
||||
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Ignored tenant {tenant_shard_id} local directory rename")
|
||||
})?;
|
||||
return Ok(tmp_path);
|
||||
}
|
||||
}
|
||||
|
||||
removal_result
|
||||
}
|
||||
|
||||
pub(crate) async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -2729,7 +2637,7 @@ use {
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
pub(crate) fn immediate_gc(
|
||||
pub(crate) async fn immediate_gc(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
@@ -2751,8 +2659,6 @@ pub(crate) fn immediate_gc(
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
|
||||
|
||||
// TODO: spawning is redundant now, need to hold the gate
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
@@ -2767,15 +2673,16 @@ pub(crate) fn immediate_gc(
|
||||
#[allow(unused_mut)]
|
||||
let mut result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
|
||||
.instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
// we need to synchronize with drop completion for python tests without polling for
|
||||
// log messages
|
||||
if let Ok(result) = result.as_mut() {
|
||||
// why not futures unordered? it seems it needs very much the same task structure
|
||||
// but would only run on single task.
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for layer in std::mem::take(&mut result.doomed_layers) {
|
||||
js.spawn(layer.wait_drop());
|
||||
@@ -2791,7 +2698,7 @@ pub(crate) fn immediate_gc(
|
||||
|
||||
if let Some(rtc) = rtc {
|
||||
// layer drops schedule actions on remote timeline client to actually do the
|
||||
// deletions; don't care about the shutdown error, just exit fast
|
||||
// deletions; don't care just exit fast about the shutdown error
|
||||
drop(rtc.wait_completion().await);
|
||||
}
|
||||
}
|
||||
@@ -2802,7 +2709,6 @@ pub(crate) fn immediate_gc(
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
.instrument(span)
|
||||
);
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
|
||||
@@ -14,16 +14,16 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
use utils::backoff;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -50,8 +50,9 @@ pub async fn download_layer_file<'a>(
|
||||
) -> Result<u64, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
let local_path = conf
|
||||
.timeline_path(&tenant_shard_id, &timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_shard_id.tenant_id,
|
||||
@@ -73,13 +74,55 @@ pub async fn download_layer_file<'a>(
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
|
||||
let (mut destination_file, bytes_amount) = download_retry(
|
||||
|| async {
|
||||
let destination_file = tokio::fs::File::create(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut destination_file =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
let destination_file = destination_file.into_inner();
|
||||
Ok((destination_file, bytes_amount))
|
||||
}
|
||||
Err(e) => {
|
||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| format!("flush source file at {temp_file_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
@@ -87,6 +130,14 @@ pub async fn download_layer_file<'a>(
|
||||
)));
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {temp_file_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
Err(DownloadError::Other(anyhow!(
|
||||
"remote-storage-download-pre-rename failpoint triggered"
|
||||
@@ -98,149 +149,16 @@ pub async fn download_layer_file<'a>(
|
||||
.with_context(|| format!("rename download layer file to {local_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// We use fatal_err() below because the after the rename above,
|
||||
// the in-memory state of the filesystem already has the layer file in its final place,
|
||||
// and subsequent pageserver code could think it's durable while it really isn't.
|
||||
let work = async move {
|
||||
let timeline_dir = VirtualFile::open(&timeline_path)
|
||||
.await
|
||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
||||
timeline_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
||||
};
|
||||
crate::virtual_file::io_engine::get()
|
||||
.spawn_blocking_and_block_on_if_std(work)
|
||||
.await;
|
||||
crashsafe::fsync_async(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("fsync layer file {local_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
tracing::debug!("download complete: {local_path}");
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
|
||||
/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
|
||||
///
|
||||
/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
|
||||
/// (Note that the directory entry for the inode is not made durable.)
|
||||
/// The file size in bytes is returned.
|
||||
///
|
||||
/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
|
||||
/// The unlinking has _not_ been made durable.
|
||||
async fn download_object<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let res = match crate::virtual_file::io_engine::get() {
|
||||
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
|
||||
crate::virtual_file::io_engine::IoEngine::StdFs => {
|
||||
async {
|
||||
let destination_file = tokio::fs::File::create(dst_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(src_path, cancel).await?;
|
||||
|
||||
let mut buf_writer =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
|
||||
buf_writer.flush().await?;
|
||||
|
||||
let mut destination_file = buf_writer.into_inner();
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| format!("flush source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
.await
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
|
||||
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
|
||||
async {
|
||||
let destination_file = VirtualFile::create(dst_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut download = storage.download(src_path, cancel).await?;
|
||||
|
||||
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
|
||||
// There's chunks_vectored() on the stream.
|
||||
let (bytes_amount, destination_file) = async {
|
||||
let size_tracking = size_tracking_writer::Writer::new(destination_file);
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<
|
||||
{ super::BUFFER_SIZE },
|
||||
_,
|
||||
>::new(size_tracking);
|
||||
while let Some(res) =
|
||||
futures::StreamExt::next(&mut download.download_stream).await
|
||||
{
|
||||
let chunk = match res {
|
||||
Ok(chunk) => chunk,
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
buffered
|
||||
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
|
||||
.await?;
|
||||
}
|
||||
let size_tracking = buffered.flush_and_into_inner().await?;
|
||||
Ok(size_tracking.into_inner())
|
||||
}
|
||||
.await?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
.await
|
||||
}
|
||||
};
|
||||
|
||||
// in case the download failed, clean up
|
||||
match res {
|
||||
Ok(bytes_amount) => Ok(bytes_amount),
|
||||
Err(e) => {
|
||||
if let Err(e) = tokio::fs::remove_file(dst_path).await {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
|
||||
}
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
|
||||
@@ -95,11 +95,7 @@ pub(crate) struct SecondaryTenant {
|
||||
shard_identity: ShardIdentity,
|
||||
tenant_conf: std::sync::Mutex<TenantConfOpt>,
|
||||
|
||||
// Internal state used by the Downloader.
|
||||
detail: std::sync::Mutex<SecondaryDetail>,
|
||||
|
||||
// Public state indicating overall progress of downloads relative to the last heatmap seen
|
||||
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
|
||||
}
|
||||
|
||||
impl SecondaryTenant {
|
||||
@@ -122,8 +118,6 @@ impl SecondaryTenant {
|
||||
tenant_conf: std::sync::Mutex::new(tenant_conf),
|
||||
|
||||
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
||||
|
||||
progress: std::sync::Mutex::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -253,12 +247,9 @@ impl SecondaryTenant {
|
||||
}
|
||||
|
||||
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
||||
/// and heatmap uploads. This is not a hot data path: it's used for:
|
||||
/// - Live migrations, where we want to ensure a migration destination has the freshest possible
|
||||
/// content before trying to cut over.
|
||||
/// - Tests, where we want to immediately upload/download for a particular tenant.
|
||||
///
|
||||
/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
|
||||
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
|
||||
/// where we want to immediately upload/download for a particular tenant. In normal operation
|
||||
/// uploads & downloads are autonomous and not driven by this interface.
|
||||
pub struct SecondaryController {
|
||||
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
||||
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
||||
|
||||
@@ -41,16 +41,14 @@ use crate::tenant::{
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
|
||||
id::TimelineId,
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -130,7 +128,6 @@ pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<Instant>,
|
||||
last_etag: Option<Etag>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
@@ -141,26 +138,11 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
|
||||
datetime.format("%d/%m/%Y %T")
|
||||
}
|
||||
|
||||
/// Information returned from download function when it detects the heatmap has changed
|
||||
struct HeatMapModified {
|
||||
etag: Etag,
|
||||
last_modified: SystemTime,
|
||||
bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
enum HeatMapDownload {
|
||||
// The heatmap's etag has changed: return the new etag, mtime and the body bytes
|
||||
Modified(HeatMapModified),
|
||||
// The heatmap's etag is unchanged
|
||||
Unmodified,
|
||||
}
|
||||
|
||||
impl SecondaryDetail {
|
||||
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
last_etag: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
@@ -495,31 +477,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
};
|
||||
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// We will use the etag from last successful download to make the download conditional on changes
|
||||
let last_etag = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_etag
|
||||
.clone();
|
||||
|
||||
// Download the tenant's heatmap
|
||||
let HeatMapModified {
|
||||
last_modified: heatmap_mtime,
|
||||
etag: heatmap_etag,
|
||||
bytes: heatmap_bytes,
|
||||
} = match tokio::select!(
|
||||
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||
let heatmap_bytes = tokio::select!(
|
||||
bytes = self.download_heatmap() => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
) {
|
||||
HeatMapDownload::Unmodified => {
|
||||
tracing::info!("Heatmap unchanged since last successful download");
|
||||
return Ok(());
|
||||
}
|
||||
HeatMapDownload::Modified(m) => m,
|
||||
};
|
||||
);
|
||||
|
||||
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
|
||||
|
||||
@@ -534,27 +496,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await
|
||||
.maybe_fatal_err(&context_msg)?;
|
||||
|
||||
tracing::debug!(
|
||||
"Wrote local heatmap to {}, with {} timelines",
|
||||
heatmap_path,
|
||||
heatmap.timelines.len()
|
||||
);
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
// phase to initialize our SecondaryProgress.
|
||||
{
|
||||
*self.secondary_state.progress.lock().unwrap() =
|
||||
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
|
||||
}
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!(
|
||||
"Cancelled before downloading timeline {}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -569,159 +515,30 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Only update last_etag after a full successful download: this way will not skip
|
||||
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Do any fast local cleanup that comes before the much slower process of downloading
|
||||
/// layers from remote storage. In the process, initialize the SecondaryProgress object
|
||||
/// that will later be updated incrementally as we download layers.
|
||||
async fn prepare_timelines(
|
||||
&self,
|
||||
heatmap: &HeatMapTenant,
|
||||
heatmap_mtime: SystemTime,
|
||||
) -> Result<SecondaryProgress, UpdateError> {
|
||||
let heatmap_stats = heatmap.get_stats();
|
||||
// We will construct a progress object, and then populate its initial "downloaded" numbers
|
||||
// while iterating through local layer state in [`Self::prepare_timelines`]
|
||||
let mut progress = SecondaryProgress {
|
||||
layers_total: heatmap_stats.layers,
|
||||
bytes_total: heatmap_stats.bytes,
|
||||
heatmap_mtime: Some(heatmap_mtime),
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
{
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
for (timeline_id, timeline_state) in &mut detail.timelines {
|
||||
let Some(heatmap_timeline_index) = heatmap
|
||||
.timelines
|
||||
.iter()
|
||||
.position(|t| t.timeline_id == *timeline_id)
|
||||
else {
|
||||
// This timeline is no longer referenced in the heatmap: delete it locally
|
||||
delete_timelines.push(*timeline_id);
|
||||
continue;
|
||||
};
|
||||
|
||||
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let mut layer_count = layers_on_disk.len();
|
||||
let mut layer_byte_count: u64 = timeline_state
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size())
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
layer_count -= 1;
|
||||
layer_byte_count -= timeline_state
|
||||
.on_disk_layers
|
||||
.get(layer)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size();
|
||||
|
||||
delete_layers.push((*timeline_id, (*layer).clone()));
|
||||
}
|
||||
|
||||
progress.bytes_downloaded += layer_byte_count;
|
||||
progress.layers_downloaded += layer_count;
|
||||
}
|
||||
}
|
||||
|
||||
// Execute accumulated deletions
|
||||
for (timeline_id, layer_name) in delete_layers {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
let local_path = timeline_path.join(layer_name.to_string());
|
||||
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
|
||||
// Update in-memory housekeeping to reflect the absence of the deleted layer
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
|
||||
continue;
|
||||
};
|
||||
timeline_state.on_disk_layers.remove(&layer_name);
|
||||
}
|
||||
|
||||
for timeline_id in delete_timelines {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
tracing::info!(timeline_id=%timeline_id,
|
||||
"Timeline no longer in heatmap, removing from secondary location"
|
||||
);
|
||||
tokio::fs::remove_dir_all(&timeline_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary timeline")?;
|
||||
}
|
||||
|
||||
Ok(progress)
|
||||
}
|
||||
|
||||
/// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object
|
||||
/// still matches `prev_etag`.
|
||||
async fn download_heatmap(
|
||||
&self,
|
||||
prev_etag: Option<&Etag>,
|
||||
) -> Result<HeatMapDownload, UpdateError> {
|
||||
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
// TODO: pull up etag check into the request, to do a conditional GET rather than
|
||||
// issuing a GET and then maybe ignoring the response body
|
||||
// TODO: make download conditional on ETag having changed since last download
|
||||
// (https://github.com/neondatabase/neon/issues/6199)
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
let cancel = &self.secondary_state.cancel;
|
||||
|
||||
backoff::retry(
|
||||
let heatmap_bytes = backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path, cancel)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
|
||||
if Some(&download.etag) == prev_etag {
|
||||
Ok(HeatMapDownload::Unmodified)
|
||||
} else {
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
Ok(HeatMapDownload::Modified(HeatMapModified {
|
||||
etag: download.etag,
|
||||
last_modified: download.last_modified,
|
||||
bytes: heatmap_bytes,
|
||||
}))
|
||||
}
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
Ok(heatmap_bytes)
|
||||
},
|
||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
@@ -731,7 +548,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UpdateError::Cancelled)
|
||||
.and_then(|x| x)
|
||||
.and_then(|x| x)?;
|
||||
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
|
||||
Ok(heatmap_bytes)
|
||||
}
|
||||
|
||||
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
||||
@@ -772,13 +593,31 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
};
|
||||
|
||||
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
let layers_in_heatmap = timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
let local_path = timeline_path.join(layer.to_string());
|
||||
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
}
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -823,12 +662,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// Failpoint for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"secondary-layer-download-sleep",
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
@@ -868,11 +701,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
} else {
|
||||
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.bytes_downloaded += downloaded_bytes;
|
||||
progress.layers_downloaded += 1;
|
||||
}
|
||||
|
||||
SECONDARY_MODE.download_layer.inc();
|
||||
|
||||
@@ -62,25 +62,3 @@ impl HeatMapTimeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct HeatMapStats {
|
||||
pub(crate) bytes: u64,
|
||||
pub(crate) layers: usize,
|
||||
}
|
||||
|
||||
impl HeatMapTenant {
|
||||
pub(crate) fn get_stats(&self) -> HeatMapStats {
|
||||
let mut stats = HeatMapStats {
|
||||
bytes: 0,
|
||||
layers: 0,
|
||||
};
|
||||
for timeline in &self.timelines {
|
||||
for layer in &timeline.layers {
|
||||
stats.layers += 1;
|
||||
stats.bytes += layer.metadata.file_size;
|
||||
}
|
||||
}
|
||||
|
||||
stats
|
||||
}
|
||||
}
|
||||
|
||||
@@ -300,7 +300,6 @@ where
|
||||
|
||||
let tenant_shard_id = job.get_tenant_shard_id();
|
||||
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
|
||||
tracing::info!("Command already running, waiting for it");
|
||||
barrier
|
||||
} else {
|
||||
let running = self.spawn_now(job);
|
||||
|
||||
@@ -183,13 +183,7 @@ pub(super) async fn gather_inputs(
|
||||
// new gc run, which we have no control over. however differently from `Timeline::gc`
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
//
|
||||
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
|
||||
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
|
||||
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// horizon_cutoff.
|
||||
let mut next_gc_cutoff = gc_info.pitr_cutoff;
|
||||
let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
|
||||
@@ -20,7 +20,6 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::models::{
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
@@ -428,7 +427,7 @@ impl LayerAccessStatFullDetails {
|
||||
} = self;
|
||||
pageserver_api::models::LayerAccessStatFullDetails {
|
||||
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
|
||||
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
|
||||
task_kind: task_kind.into(), // into static str, powered by strum_macros
|
||||
access_kind: *access_kind,
|
||||
}
|
||||
}
|
||||
@@ -526,7 +525,7 @@ impl LayerAccessStats {
|
||||
.collect(),
|
||||
task_kind_access_flag: task_kind_flag
|
||||
.iter()
|
||||
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
|
||||
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
||||
.collect(),
|
||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
||||
|
||||
@@ -46,7 +46,6 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -848,33 +847,10 @@ impl DeltaLayerInner {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let planner = VectoredReadPlanner::new(
|
||||
self.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into(),
|
||||
);
|
||||
|
||||
let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
|
||||
let reads = Self::plan_reads(
|
||||
keyspace,
|
||||
lsn_range,
|
||||
data_end_offset,
|
||||
index_reader,
|
||||
planner,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
let reads = self
|
||||
.plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
.map_err(GetVectoredError::Other)?;
|
||||
|
||||
self.do_reads_and_update_state(reads, reconstruct_state)
|
||||
.await;
|
||||
@@ -882,64 +858,73 @@ impl DeltaLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn plan_reads<Reader>(
|
||||
async fn plan_reads(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
data_end_offset: u64,
|
||||
index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
|
||||
mut planner: VectoredReadPlanner,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<VectoredRead>>
|
||||
where
|
||||
Reader: BlockReader,
|
||||
{
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build();
|
||||
) -> anyhow::Result<Vec<VectoredRead>> {
|
||||
let mut planner = VectoredReadPlanner::new(
|
||||
self.max_vectored_read_bytes
|
||||
.expect("Layer is loaded with max vectored bytes config")
|
||||
.0
|
||||
.into(),
|
||||
);
|
||||
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
|
||||
let index_stream = index_reader.get_stream_from(&start_key.0, &ctx);
|
||||
let mut index_stream = std::pin::pin!(index_stream);
|
||||
tree_reader
|
||||
.visit(
|
||||
&start_key.0,
|
||||
VisitDirection::Forwards,
|
||||
|raw_key, value| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
|
||||
while let Some(index_entry) = index_stream.next().await {
|
||||
let (raw_key, value) = index_entry?;
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
|
||||
let blob_ref = BlobRef(value);
|
||||
assert!(key >= range.start && lsn >= lsn_range.start);
|
||||
|
||||
// Lsns are not monotonically increasing across keys, so we don't assert on them.
|
||||
assert!(key >= range.start);
|
||||
let cached_lsn = reconstruct_state.get_cached_lsn(&key);
|
||||
let flag = {
|
||||
if cached_lsn >= Some(lsn) {
|
||||
BlobFlag::Ignore
|
||||
} else if blob_ref.will_init() {
|
||||
BlobFlag::Replaces
|
||||
} else {
|
||||
BlobFlag::None
|
||||
}
|
||||
};
|
||||
|
||||
let outside_lsn_range = !lsn_range.contains(&lsn);
|
||||
let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn);
|
||||
|
||||
let flag = {
|
||||
if outside_lsn_range || below_cached_lsn {
|
||||
BlobFlag::Ignore
|
||||
} else if blob_ref.will_init() {
|
||||
BlobFlag::ReplaceAll
|
||||
} else {
|
||||
// Usual path: add blob to the read
|
||||
BlobFlag::None
|
||||
}
|
||||
};
|
||||
|
||||
if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
|
||||
planner.handle_range_end(blob_ref.pos());
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, lsn, blob_ref.pos(), flag);
|
||||
}
|
||||
}
|
||||
if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
|
||||
planner.handle_range_end(blob_ref.pos());
|
||||
range_end_handled = true;
|
||||
false
|
||||
} else {
|
||||
planner.handle(key, lsn, blob_ref.pos(), flag);
|
||||
true
|
||||
}
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
|
||||
if !range_end_handled {
|
||||
tracing::info!("Handling range end fallback at {}", data_end_offset);
|
||||
planner.handle_range_end(data_end_offset);
|
||||
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
tracing::info!("Handling range end fallback at {}", payload_end);
|
||||
planner.handle_range_end(payload_end);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1205,131 +1190,3 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
|
||||
self.size
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
|
||||
};
|
||||
|
||||
/// Construct an index for a fictional delta layer and and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
/// verify that the traversal fed the right index key and value
|
||||
/// pairs into the planner.
|
||||
#[tokio::test]
|
||||
async fn test_delta_layer_index_traversal() {
|
||||
let base_key = Key {
|
||||
field1: 0,
|
||||
field2: 1663,
|
||||
field3: 12972,
|
||||
field4: 16396,
|
||||
field5: 0,
|
||||
field6: 246080,
|
||||
};
|
||||
|
||||
// Populate the index with some entries
|
||||
let entries: BTreeMap<Key, Vec<Lsn>> = BTreeMap::from([
|
||||
(base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]),
|
||||
(base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
|
||||
(base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]),
|
||||
(base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]),
|
||||
]);
|
||||
|
||||
let mut disk = TestDisk::default();
|
||||
let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk);
|
||||
|
||||
let mut disk_offset = 0;
|
||||
for (key, lsns) in &entries {
|
||||
for lsn in lsns {
|
||||
let index_key = DeltaKey::from_key_lsn(key, *lsn);
|
||||
let blob_ref = BlobRef::new(disk_offset, false);
|
||||
writer
|
||||
.append(&index_key.0, blob_ref.0)
|
||||
.expect("In memory disk append should never fail");
|
||||
|
||||
disk_offset += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare all the arguments for the call into `plan_reads` below
|
||||
let (root_offset, _writer) = writer
|
||||
.finish()
|
||||
.expect("In memory disk finish should never fail");
|
||||
let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk);
|
||||
let planner = VectoredReadPlanner::new(100);
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
let keyspace = KeySpace {
|
||||
ranges: vec![
|
||||
base_key..base_key.add(3),
|
||||
base_key.add(3)..base_key.add(100),
|
||||
],
|
||||
};
|
||||
let lsn_range = Lsn(2)..Lsn(40);
|
||||
|
||||
// Plan and validate
|
||||
let vectored_reads = DeltaLayerInner::plan_reads(
|
||||
keyspace.clone(),
|
||||
lsn_range.clone(),
|
||||
disk_offset,
|
||||
reader,
|
||||
planner,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.expect("Read planning should not fail");
|
||||
|
||||
validate(keyspace, lsn_range, vectored_reads, entries);
|
||||
}
|
||||
|
||||
fn validate(
|
||||
keyspace: KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
vectored_reads: Vec<VectoredRead>,
|
||||
index_entries: BTreeMap<Key, Vec<Lsn>>,
|
||||
) {
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
struct BlobSpec {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
at: u64,
|
||||
}
|
||||
|
||||
let mut planned_blobs = Vec::new();
|
||||
for read in vectored_reads {
|
||||
for (at, meta) in read.blobs_at.as_slice() {
|
||||
planned_blobs.push(BlobSpec {
|
||||
key: meta.key,
|
||||
lsn: meta.lsn,
|
||||
at: *at,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let mut expected_blobs = Vec::new();
|
||||
let mut disk_offset = 0;
|
||||
for (key, lsns) in index_entries {
|
||||
for lsn in lsns {
|
||||
let key_included = keyspace.ranges.iter().any(|range| range.contains(&key));
|
||||
let lsn_included = lsn_range.contains(&lsn);
|
||||
|
||||
if key_included && lsn_included {
|
||||
expected_blobs.push(BlobSpec {
|
||||
key,
|
||||
lsn,
|
||||
at: disk_offset,
|
||||
});
|
||||
}
|
||||
|
||||
disk_offset += 1;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(planned_blobs, expected_blobs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,7 +43,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hex;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -55,7 +54,6 @@ use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -490,33 +488,35 @@ impl ImageLayerInner {
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
range.start.write_to_byte_slice(&mut search_key);
|
||||
|
||||
let index_stream = tree_reader.get_stream_from(&search_key, &ctx);
|
||||
let mut index_stream = std::pin::pin!(index_stream);
|
||||
tree_reader
|
||||
.visit(
|
||||
&search_key,
|
||||
VisitDirection::Forwards,
|
||||
|raw_key, offset| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
while let Some(index_entry) = index_stream.next().await {
|
||||
let (raw_key, offset) = index_entry?;
|
||||
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
assert!(key >= range.start);
|
||||
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
break;
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
}
|
||||
}
|
||||
if key >= range.end {
|
||||
planner.handle_range_end(offset);
|
||||
range_end_handled = true;
|
||||
false
|
||||
} else {
|
||||
planner.handle(key, self.lsn, offset, BlobFlag::None);
|
||||
true
|
||||
}
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
|
||||
|
||||
if !range_end_handled {
|
||||
let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
|
||||
@@ -336,17 +336,32 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
|
||||
pub(crate) async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
|
||||
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn put_values(
|
||||
&self,
|
||||
values: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
for (key, vals) in values {
|
||||
for (lsn, val) in vals {
|
||||
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_value_locked(
|
||||
@@ -354,16 +369,22 @@ impl InMemoryLayer {
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
val: &Value,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
|
||||
let off = {
|
||||
// Avoid doing allocations for "small" values.
|
||||
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
|
||||
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
|
||||
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
|
||||
buf.clear();
|
||||
val.ser_into(&mut buf)?;
|
||||
locked_inner
|
||||
.file
|
||||
.write_blob(
|
||||
buf,
|
||||
&buf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
@@ -391,12 +412,7 @@ impl InMemoryLayer {
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
"{} >= {}",
|
||||
self.start_lsn,
|
||||
end_lsn
|
||||
);
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user