mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 13:10:38 +00:00
Compare commits
83 Commits
conrad/rem
...
conrad/fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cc66f78d01 | ||
|
|
f9e6802974 | ||
|
|
74afc9d96f | ||
|
|
86fe3150f0 | ||
|
|
52be0146d3 | ||
|
|
a3f2a2cae5 | ||
|
|
a24a0032ad | ||
|
|
70cb02742a | ||
|
|
a845295cb3 | ||
|
|
e288cd2198 | ||
|
|
ffa9e595b8 | ||
|
|
e7b1f63f68 | ||
|
|
1dce2a9e74 | ||
|
|
ca88521653 | ||
|
|
07c3cfd2a0 | ||
|
|
7cd0066212 | ||
|
|
bf3a1529bf | ||
|
|
65d1be6e90 | ||
|
|
16eb8dda3d | ||
|
|
bb32f1b3d0 | ||
|
|
5585c32cee | ||
|
|
0ffdc98e20 | ||
|
|
62d844e657 | ||
|
|
1bb434ab74 | ||
|
|
dbde37c53a | ||
|
|
5e3cb2ab07 | ||
|
|
61f267d8f9 | ||
|
|
e2411818ef | ||
|
|
58327cbba8 | ||
|
|
568927a8a0 | ||
|
|
1ed7252950 | ||
|
|
30b57334ef | ||
|
|
d487ba2b9b | ||
|
|
e7a1d5de94 | ||
|
|
6be572177c | ||
|
|
fe7a4e1ab6 | ||
|
|
40cae8cc36 | ||
|
|
02fc8b7c70 | ||
|
|
60feb168e2 | ||
|
|
da596a5162 | ||
|
|
effd6bf829 | ||
|
|
a6e0baf31a | ||
|
|
19b74b8837 | ||
|
|
25718e324a | ||
|
|
ac8f44c70e | ||
|
|
d09664f039 | ||
|
|
6689d6fd89 | ||
|
|
33b400beae | ||
|
|
ca07f7dba5 | ||
|
|
b0dfe0ffa6 | ||
|
|
185ead8395 | ||
|
|
37e322438b | ||
|
|
fca2c32e59 | ||
|
|
d19aebcf12 | ||
|
|
a70a5bccff | ||
|
|
d9cedb4a95 | ||
|
|
b623fbae0c | ||
|
|
512210bb5a | ||
|
|
9eebd6fc79 | ||
|
|
11527b9df7 | ||
|
|
89554af1bd | ||
|
|
f391186aa7 | ||
|
|
94b41b531b | ||
|
|
d793088225 | ||
|
|
67ad420e26 | ||
|
|
90cd5a5be8 | ||
|
|
643448b1a2 | ||
|
|
8daebb6ed4 | ||
|
|
ab14521ea5 | ||
|
|
e82021d6fe | ||
|
|
9997661138 | ||
|
|
0e427fc117 | ||
|
|
9b2e6f862a | ||
|
|
12e87d7a9f | ||
|
|
a56afee269 | ||
|
|
9e6ca2932f | ||
|
|
63ea4b0579 | ||
|
|
20881ef65e | ||
|
|
a695713727 | ||
|
|
5c57e8a11b | ||
|
|
84a2556c9f | ||
|
|
761e9e0e1d | ||
|
|
94cb9a79d9 |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -31,7 +31,7 @@ config-variables:
|
||||
- NEON_PROD_AWS_ACCOUNT_ID
|
||||
- PGREGRESS_PG16_PROJECT_ID
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
- PREWARM_PGBENCH_SIZE
|
||||
- PREWARM_PROJECT_ID
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_CICD_CHANNEL_ID
|
||||
|
||||
384
.github/workflows/benchbase_tpcc.yml
vendored
Normal file
384
.github/workflows/benchbase_tpcc.yml
vendored
Normal file
@@ -0,0 +1,384 @@
|
||||
name: TPC-C like benchmark using benchbase
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 6 * * *' # run once a day at 6 AM UTC
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow globally because we do not want to be too noisy in production environment
|
||||
group: benchbase-tpcc-workflow
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
benchbase-tpcc:
|
||||
strategy:
|
||||
fail-fast: false # allow other variants to continue even if one fails
|
||||
matrix:
|
||||
include:
|
||||
- warehouses: 50 # defines number of warehouses and is used to compute number of terminals
|
||||
max_rate: 800 # measured max TPS at scale factor based on experiments. Adjust if performance is better/worse
|
||||
min_cu: 0.25 # simulate free tier plan (0.25-2 CU)
|
||||
max_cu: 2
|
||||
- warehouses: 500 # serverless plan (2-8 CU)
|
||||
max_rate: 2000
|
||||
min_cu: 2
|
||||
max_cu: 8
|
||||
- warehouses: 1000 # business plan (2-16 CU)
|
||||
max_rate: 2900
|
||||
min_cu: 2
|
||||
max_cu: 16
|
||||
max-parallel: 1 # we want to run each workload size sequentially to avoid noisy neighbors
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
PG_CONFIG: /tmp/neon/pg_install/v17/bin/pg_config
|
||||
PSQL: /tmp/neon/pg_install/v17/bin/psql
|
||||
PG_17_LIB_PATH: /tmp/neon/pg_install/v17/lib
|
||||
POSTGRES_VERSION: 17
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
timeout-minutes: 1440
|
||||
|
||||
steps:
|
||||
- name: Harden the runner (Audit all outbound calls)
|
||||
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
- name: Configure AWS credentials # necessary to download artefacts
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project-tpcc
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: aws-us-east-2
|
||||
postgres_version: ${{ env.POSTGRES_VERSION }}
|
||||
compute_units: '[${{ matrix.min_cu }}, ${{ matrix.max_cu }}]'
|
||||
api_key: ${{ secrets.NEON_PRODUCTION_API_KEY_4_BENCHMARKS }}
|
||||
api_host: console.neon.tech # production (!)
|
||||
|
||||
- name: Initialize Neon project
|
||||
env:
|
||||
BENCHMARK_TPCC_CONNSTR: ${{ steps.create-neon-project-tpcc.outputs.dsn }}
|
||||
PROJECT_ID: ${{ steps.create-neon-project-tpcc.outputs.project_id }}
|
||||
run: |
|
||||
echo "Initializing Neon project with project_id: ${PROJECT_ID}"
|
||||
export LD_LIBRARY_PATH=${PG_17_LIB_PATH}
|
||||
|
||||
# Retry logic for psql connection with 1 minute sleep between attempts
|
||||
for attempt in {1..3}; do
|
||||
echo "Attempt ${attempt}/3: Creating extensions in Neon project"
|
||||
if ${PSQL} "${BENCHMARK_TPCC_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"; then
|
||||
echo "Successfully created extensions"
|
||||
break
|
||||
else
|
||||
echo "Failed to create extensions on attempt ${attempt}"
|
||||
if [ ${attempt} -lt 3 ]; then
|
||||
echo "Waiting 60 seconds before retry..."
|
||||
sleep 60
|
||||
else
|
||||
echo "All attempts failed, exiting"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
echo "BENCHMARK_TPCC_CONNSTR=${BENCHMARK_TPCC_CONNSTR}" >> $GITHUB_ENV
|
||||
|
||||
- name: Generate BenchBase workload configuration
|
||||
env:
|
||||
WAREHOUSES: ${{ matrix.warehouses }}
|
||||
MAX_RATE: ${{ matrix.max_rate }}
|
||||
run: |
|
||||
echo "Generating BenchBase configs for warehouses: ${WAREHOUSES}, max_rate: ${MAX_RATE}"
|
||||
|
||||
# Extract hostname and password from connection string
|
||||
# Format: postgresql://username:password@hostname/database?params (no port for Neon)
|
||||
HOSTNAME=$(echo "${BENCHMARK_TPCC_CONNSTR}" | sed -n 's|.*://[^:]*:[^@]*@\([^/]*\)/.*|\1|p')
|
||||
PASSWORD=$(echo "${BENCHMARK_TPCC_CONNSTR}" | sed -n 's|.*://[^:]*:\([^@]*\)@.*|\1|p')
|
||||
|
||||
echo "Extracted hostname: ${HOSTNAME}"
|
||||
|
||||
# Use runner temp (NVMe) as working directory
|
||||
cd "${RUNNER_TEMP}"
|
||||
|
||||
# Copy the generator script
|
||||
cp "${GITHUB_WORKSPACE}/test_runner/performance/benchbase_tpc_c_helpers/generate_workload_size.py" .
|
||||
|
||||
# Generate configs and scripts
|
||||
python3 generate_workload_size.py \
|
||||
--warehouses ${WAREHOUSES} \
|
||||
--max-rate ${MAX_RATE} \
|
||||
--hostname ${HOSTNAME} \
|
||||
--password ${PASSWORD} \
|
||||
--runner-arch ${{ runner.arch }}
|
||||
|
||||
# Fix path mismatch: move generated configs and scripts to expected locations
|
||||
mv ../configs ./configs
|
||||
mv ../scripts ./scripts
|
||||
|
||||
- name: Prepare database (load data)
|
||||
env:
|
||||
WAREHOUSES: ${{ matrix.warehouses }}
|
||||
run: |
|
||||
cd "${RUNNER_TEMP}"
|
||||
|
||||
echo "Loading ${WAREHOUSES} warehouses into database..."
|
||||
|
||||
# Run the loader script and capture output to log file while preserving stdout/stderr
|
||||
./scripts/load_${WAREHOUSES}_warehouses.sh 2>&1 | tee "load_${WAREHOUSES}_warehouses.log"
|
||||
|
||||
echo "Database loading completed"
|
||||
|
||||
- name: Run TPC-C benchmark (warmup phase, then benchmark at 70% of configured max TPS)
|
||||
env:
|
||||
WAREHOUSES: ${{ matrix.warehouses }}
|
||||
run: |
|
||||
cd "${RUNNER_TEMP}"
|
||||
|
||||
echo "Running TPC-C benchmark with ${WAREHOUSES} warehouses..."
|
||||
|
||||
# Run the optimal rate benchmark
|
||||
./scripts/execute_${WAREHOUSES}_warehouses_opt_rate.sh
|
||||
|
||||
echo "Benchmark execution completed"
|
||||
|
||||
- name: Run TPC-C benchmark (warmup phase, then ramp down TPS and up again in 5 minute intervals)
|
||||
|
||||
env:
|
||||
WAREHOUSES: ${{ matrix.warehouses }}
|
||||
run: |
|
||||
cd "${RUNNER_TEMP}"
|
||||
|
||||
echo "Running TPC-C ramp-down-up with ${WAREHOUSES} warehouses..."
|
||||
|
||||
# Run the ramp-down/ramp-up benchmark
|
||||
./scripts/execute_${WAREHOUSES}_warehouses_ramp_up.sh
|
||||
|
||||
echo "Benchmark execution completed"
|
||||
|
||||
- name: Process results (upload to test results database and generate diagrams)
|
||||
env:
|
||||
WAREHOUSES: ${{ matrix.warehouses }}
|
||||
MIN_CU: ${{ matrix.min_cu }}
|
||||
MAX_CU: ${{ matrix.max_cu }}
|
||||
PROJECT_ID: ${{ steps.create-neon-project-tpcc.outputs.project_id }}
|
||||
REVISION: ${{ github.sha }}
|
||||
PERF_DB_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
|
||||
run: |
|
||||
cd "${RUNNER_TEMP}"
|
||||
|
||||
echo "Creating temporary Python environment for results processing..."
|
||||
|
||||
# Create temporary virtual environment
|
||||
python3 -m venv temp_results_env
|
||||
source temp_results_env/bin/activate
|
||||
|
||||
# Install required packages in virtual environment
|
||||
pip install matplotlib pandas psycopg2-binary
|
||||
|
||||
echo "Copying results processing scripts..."
|
||||
|
||||
# Copy both processing scripts
|
||||
cp "${GITHUB_WORKSPACE}/test_runner/performance/benchbase_tpc_c_helpers/generate_diagrams.py" .
|
||||
cp "${GITHUB_WORKSPACE}/test_runner/performance/benchbase_tpc_c_helpers/upload_results_to_perf_test_results.py" .
|
||||
|
||||
echo "Processing load phase metrics..."
|
||||
|
||||
# Find and process load log
|
||||
LOAD_LOG=$(find . -name "load_${WAREHOUSES}_warehouses.log" -type f | head -1)
|
||||
if [ -n "$LOAD_LOG" ]; then
|
||||
echo "Processing load metrics from: $LOAD_LOG"
|
||||
python upload_results_to_perf_test_results.py \
|
||||
--load-log "$LOAD_LOG" \
|
||||
--run-type "load" \
|
||||
--warehouses "${WAREHOUSES}" \
|
||||
--min-cu "${MIN_CU}" \
|
||||
--max-cu "${MAX_CU}" \
|
||||
--project-id "${PROJECT_ID}" \
|
||||
--revision "${REVISION}" \
|
||||
--connection-string "${PERF_DB_CONNSTR}"
|
||||
else
|
||||
echo "Warning: Load log file not found: load_${WAREHOUSES}_warehouses.log"
|
||||
fi
|
||||
|
||||
echo "Processing warmup results for optimal rate..."
|
||||
|
||||
# Find and process warmup results
|
||||
WARMUP_CSV=$(find results_warmup -name "*.results.csv" -type f | head -1)
|
||||
WARMUP_JSON=$(find results_warmup -name "*.summary.json" -type f | head -1)
|
||||
|
||||
if [ -n "$WARMUP_CSV" ] && [ -n "$WARMUP_JSON" ]; then
|
||||
echo "Generating warmup diagram from: $WARMUP_CSV"
|
||||
python generate_diagrams.py \
|
||||
--input-csv "$WARMUP_CSV" \
|
||||
--output-svg "warmup_${WAREHOUSES}_warehouses_performance.svg" \
|
||||
--title-suffix "Warmup at max TPS"
|
||||
|
||||
echo "Uploading warmup metrics from: $WARMUP_JSON"
|
||||
python upload_results_to_perf_test_results.py \
|
||||
--summary-json "$WARMUP_JSON" \
|
||||
--results-csv "$WARMUP_CSV" \
|
||||
--run-type "warmup" \
|
||||
--min-cu "${MIN_CU}" \
|
||||
--max-cu "${MAX_CU}" \
|
||||
--project-id "${PROJECT_ID}" \
|
||||
--revision "${REVISION}" \
|
||||
--connection-string "${PERF_DB_CONNSTR}"
|
||||
else
|
||||
echo "Warning: Missing warmup results files (CSV: $WARMUP_CSV, JSON: $WARMUP_JSON)"
|
||||
fi
|
||||
|
||||
echo "Processing optimal rate results..."
|
||||
|
||||
# Find and process optimal rate results
|
||||
OPTRATE_CSV=$(find results_opt_rate -name "*.results.csv" -type f | head -1)
|
||||
OPTRATE_JSON=$(find results_opt_rate -name "*.summary.json" -type f | head -1)
|
||||
|
||||
if [ -n "$OPTRATE_CSV" ] && [ -n "$OPTRATE_JSON" ]; then
|
||||
echo "Generating optimal rate diagram from: $OPTRATE_CSV"
|
||||
python generate_diagrams.py \
|
||||
--input-csv "$OPTRATE_CSV" \
|
||||
--output-svg "benchmark_${WAREHOUSES}_warehouses_performance.svg" \
|
||||
--title-suffix "70% of max TPS"
|
||||
|
||||
echo "Uploading optimal rate metrics from: $OPTRATE_JSON"
|
||||
python upload_results_to_perf_test_results.py \
|
||||
--summary-json "$OPTRATE_JSON" \
|
||||
--results-csv "$OPTRATE_CSV" \
|
||||
--run-type "opt-rate" \
|
||||
--min-cu "${MIN_CU}" \
|
||||
--max-cu "${MAX_CU}" \
|
||||
--project-id "${PROJECT_ID}" \
|
||||
--revision "${REVISION}" \
|
||||
--connection-string "${PERF_DB_CONNSTR}"
|
||||
else
|
||||
echo "Warning: Missing optimal rate results files (CSV: $OPTRATE_CSV, JSON: $OPTRATE_JSON)"
|
||||
fi
|
||||
|
||||
echo "Processing warmup 2 results for ramp down/up phase..."
|
||||
|
||||
# Find and process the warmup results of the ramp down/up run (last match in results_warmup)
|
||||
WARMUP_CSV=$(find results_warmup -name "*.results.csv" -type f | tail -1)
|
||||
WARMUP_JSON=$(find results_warmup -name "*.summary.json" -type f | tail -1)
|
||||
|
||||
if [ -n "$WARMUP_CSV" ] && [ -n "$WARMUP_JSON" ]; then
|
||||
echo "Generating warmup diagram from: $WARMUP_CSV"
|
||||
python generate_diagrams.py \
|
||||
--input-csv "$WARMUP_CSV" \
|
||||
--output-svg "warmup_2_${WAREHOUSES}_warehouses_performance.svg" \
|
||||
--title-suffix "Warmup at max TPS"
|
||||
|
||||
echo "Uploading warmup metrics from: $WARMUP_JSON"
|
||||
python upload_results_to_perf_test_results.py \
|
||||
--summary-json "$WARMUP_JSON" \
|
||||
--results-csv "$WARMUP_CSV" \
|
||||
--run-type "warmup" \
|
||||
--min-cu "${MIN_CU}" \
|
||||
--max-cu "${MAX_CU}" \
|
||||
--project-id "${PROJECT_ID}" \
|
||||
--revision "${REVISION}" \
|
||||
--connection-string "${PERF_DB_CONNSTR}"
|
||||
else
|
||||
echo "Warning: Missing warmup results files (CSV: $WARMUP_CSV, JSON: $WARMUP_JSON)"
|
||||
fi
|
||||
|
||||
echo "Processing ramp results..."
|
||||
|
||||
# Find and process ramp results
|
||||
RAMPUP_CSV=$(find results_ramp_up -name "*.results.csv" -type f | head -1)
|
||||
RAMPUP_JSON=$(find results_ramp_up -name "*.summary.json" -type f | head -1)
|
||||
|
||||
if [ -n "$RAMPUP_CSV" ] && [ -n "$RAMPUP_JSON" ]; then
|
||||
echo "Generating ramp diagram from: $RAMPUP_CSV"
|
||||
python generate_diagrams.py \
|
||||
--input-csv "$RAMPUP_CSV" \
|
||||
--output-svg "ramp_${WAREHOUSES}_warehouses_performance.svg" \
|
||||
--title-suffix "ramp TPS down and up in 5 minute intervals"
|
||||
|
||||
echo "Uploading ramp metrics from: $RAMPUP_JSON"
|
||||
python upload_results_to_perf_test_results.py \
|
||||
--summary-json "$RAMPUP_JSON" \
|
||||
--results-csv "$RAMPUP_CSV" \
|
||||
--run-type "ramp-up" \
|
||||
--min-cu "${MIN_CU}" \
|
||||
--max-cu "${MAX_CU}" \
|
||||
--project-id "${PROJECT_ID}" \
|
||||
--revision "${REVISION}" \
|
||||
--connection-string "${PERF_DB_CONNSTR}"
|
||||
else
|
||||
echo "Warning: Missing ramp results files (CSV: $RAMPUP_CSV, JSON: $RAMPUP_JSON)"
|
||||
fi
|
||||
|
||||
# Deactivate and clean up virtual environment
|
||||
deactivate
|
||||
rm -rf temp_results_env
|
||||
rm upload_results_to_perf_test_results.py
|
||||
|
||||
echo "Results processing completed and environment cleaned up"
|
||||
|
||||
- name: Set date for upload
|
||||
id: set-date
|
||||
run: echo "date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Configure AWS credentials # necessary to upload results
|
||||
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
|
||||
with:
|
||||
aws-region: us-east-2
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 900 # 900 is minimum value
|
||||
|
||||
- name: Upload benchmark results to S3
|
||||
env:
|
||||
S3_BUCKET: neon-public-benchmark-results
|
||||
S3_PREFIX: benchbase-tpc-c/${{ steps.set-date.outputs.date }}/${{ github.run_id }}/${{ matrix.warehouses }}-warehouses
|
||||
run: |
|
||||
echo "Redacting passwords from configuration files before upload..."
|
||||
|
||||
# Mask all passwords in XML config files
|
||||
find "${RUNNER_TEMP}/configs" -name "*.xml" -type f -exec sed -i 's|<password>[^<]*</password>|<password>redacted</password>|g' {} \;
|
||||
|
||||
echo "Uploading benchmark results to s3://${S3_BUCKET}/${S3_PREFIX}/"
|
||||
|
||||
# Upload the entire benchmark directory recursively
|
||||
aws s3 cp --only-show-errors --recursive "${RUNNER_TEMP}" s3://${S3_BUCKET}/${S3_PREFIX}/
|
||||
|
||||
echo "Upload completed"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project-tpcc.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_PRODUCTION_API_KEY_4_BENCHMARKS }}
|
||||
api_host: console.neon.tech # production (!)
|
||||
2
.github/workflows/benchmarking.yml
vendored
2
.github/workflows/benchmarking.yml
vendored
@@ -418,7 +418,7 @@ jobs:
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
|
||||
PROJECT_ID: ${{ vars.PREWARM_PROJECT_ID }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 17
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
|
||||
@@ -146,7 +146,9 @@ jobs:
|
||||
with:
|
||||
file: build-tools/Dockerfile
|
||||
context: .
|
||||
provenance: false
|
||||
attests: |
|
||||
type=provenance,mode=max
|
||||
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
|
||||
push: true
|
||||
pull: true
|
||||
build-args: |
|
||||
|
||||
12
.github/workflows/build_and_test.yml
vendored
12
.github/workflows/build_and_test.yml
vendored
@@ -634,7 +634,9 @@ jobs:
|
||||
DEBIAN_VERSION=bookworm
|
||||
secrets: |
|
||||
SUBZERO_ACCESS_TOKEN=${{ secrets.CI_ACCESS_TOKEN }}
|
||||
provenance: false
|
||||
attests: |
|
||||
type=provenance,mode=max
|
||||
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
@@ -747,7 +749,9 @@ jobs:
|
||||
PG_VERSION=${{ matrix.version.pg }}
|
||||
BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
|
||||
DEBIAN_VERSION=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
attests: |
|
||||
type=provenance,mode=max
|
||||
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
|
||||
push: true
|
||||
pull: true
|
||||
file: compute/compute-node.Dockerfile
|
||||
@@ -766,7 +770,9 @@ jobs:
|
||||
PG_VERSION=${{ matrix.version.pg }}
|
||||
BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
|
||||
DEBIAN_VERSION=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
attests: |
|
||||
type=provenance,mode=max
|
||||
type=sbom,generator=docker.io/docker/buildkit-syft-scanner:1
|
||||
push: true
|
||||
pull: true
|
||||
file: compute/compute-node.Dockerfile
|
||||
|
||||
28
.github/workflows/pg-clients.yml
vendored
28
.github/workflows/pg-clients.yml
vendored
@@ -48,8 +48,20 @@ jobs:
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
secrets: inherit
|
||||
|
||||
generate-ch-tmppw:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
tmp_val: ${{ steps.pwgen.outputs.tmp_val }}
|
||||
steps:
|
||||
- name: Generate a random password
|
||||
id: pwgen
|
||||
run: |
|
||||
set +x
|
||||
p=$(dd if=/dev/random bs=14 count=1 2>/dev/null | base64)
|
||||
echo tmp_val="${p//\//}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
test-logical-replication:
|
||||
needs: [ build-build-tools-image ]
|
||||
needs: [ build-build-tools-image, generate-ch-tmppw ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
container:
|
||||
@@ -60,16 +72,21 @@ jobs:
|
||||
options: --init --user root
|
||||
services:
|
||||
clickhouse:
|
||||
image: clickhouse/clickhouse-server:24.6.3.64
|
||||
image: clickhouse/clickhouse-server:25.6
|
||||
env:
|
||||
CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }}
|
||||
PGSSLCERT: /tmp/postgresql.crt
|
||||
ports:
|
||||
- 9000:9000
|
||||
- 8123:8123
|
||||
zookeeper:
|
||||
image: quay.io/debezium/zookeeper:2.7
|
||||
image: quay.io/debezium/zookeeper:3.1.3.Final
|
||||
ports:
|
||||
- 2181:2181
|
||||
- 2888:2888
|
||||
- 3888:3888
|
||||
kafka:
|
||||
image: quay.io/debezium/kafka:2.7
|
||||
image: quay.io/debezium/kafka:3.1.3.Final
|
||||
env:
|
||||
ZOOKEEPER_CONNECT: "zookeeper:2181"
|
||||
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
|
||||
@@ -79,7 +96,7 @@ jobs:
|
||||
ports:
|
||||
- 9092:9092
|
||||
debezium:
|
||||
image: quay.io/debezium/connect:2.7
|
||||
image: quay.io/debezium/connect:3.1.3.Final
|
||||
env:
|
||||
BOOTSTRAP_SERVERS: kafka:9092
|
||||
GROUP_ID: 1
|
||||
@@ -125,6 +142,7 @@ jobs:
|
||||
aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
CLICKHOUSE_PASSWORD: ${{ needs.generate-ch-tmppw.outputs.tmp_val }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: always()
|
||||
|
||||
48
.github/workflows/proxy-benchmark.yml
vendored
48
.github/workflows/proxy-benchmark.yml
vendored
@@ -3,7 +3,7 @@ name: Periodic proxy performance test on unit-perf-aws-arm runners
|
||||
on:
|
||||
push: # TODO: remove after testing
|
||||
branches:
|
||||
- test-proxy-bench # Runs on pushes to branches starting with test-proxy-bench
|
||||
- test-proxy-bench # Runs on pushes to test-proxy-bench branch
|
||||
# schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
@@ -32,7 +32,7 @@ jobs:
|
||||
statuses: write
|
||||
contents: write
|
||||
pull-requests: write
|
||||
runs-on: [self-hosted, unit-perf-aws-arm]
|
||||
runs-on: [ self-hosted, unit-perf-aws-arm ]
|
||||
timeout-minutes: 60 # 1h timeout
|
||||
container:
|
||||
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
|
||||
@@ -55,30 +55,58 @@ jobs:
|
||||
{
|
||||
echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH"
|
||||
echo "NEON_DIR=${RUNNER_TEMP}/neon"
|
||||
echo "NEON_PROXY_PATH=${RUNNER_TEMP}/neon/bin/proxy"
|
||||
echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output"
|
||||
echo ""
|
||||
} >> "$GITHUB_ENV"
|
||||
|
||||
- name: Run proxy-bench
|
||||
run: ${PROXY_BENCH_PATH}/run.sh
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Ingest Bench Results # neon repo script
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: show ulimits
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
ulimit -a
|
||||
|
||||
- name: Run proxy-bench
|
||||
working-directory: ${{ env.PROXY_BENCH_PATH }}
|
||||
run: ./run.sh --with-grafana --bare-metal
|
||||
|
||||
- name: Ingest Bench Results
|
||||
if: always()
|
||||
working-directory: ${{ env.NEON_DIR }}
|
||||
run: |
|
||||
mkdir -p $TEST_OUTPUT
|
||||
python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT
|
||||
|
||||
- name: Push Metrics to Proxy perf database
|
||||
shell: bash -euxo pipefail {0}
|
||||
if: always()
|
||||
env:
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
|
||||
REPORT_FROM: $TEST_OUTPUT
|
||||
working-directory: ${{ env.NEON_DIR }}
|
||||
run: $NEON_DIR/scripts/generate_and_push_perf_report.sh
|
||||
|
||||
- name: Docker cleanup
|
||||
if: always()
|
||||
run: docker compose down
|
||||
|
||||
- name: Notify Failure
|
||||
if: failure()
|
||||
run: echo "Proxy bench job failed" && exit 1
|
||||
run: echo "Proxy bench job failed" && exit 1
|
||||
|
||||
- name: Cleanup Test Resources
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Cleanup the test resources
|
||||
if [[ -d "${TEST_OUTPUT}" ]]; then
|
||||
rm -rf ${TEST_OUTPUT}
|
||||
fi
|
||||
if [[ -d "${PROXY_BENCH_PATH}/test_output" ]]; then
|
||||
rm -rf ${PROXY_BENCH_PATH}/test_output
|
||||
fi
|
||||
209
Cargo.lock
generated
209
Cargo.lock
generated
@@ -211,11 +211,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-lock"
|
||||
version = "3.2.0"
|
||||
version = "3.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
|
||||
checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18"
|
||||
dependencies = [
|
||||
"event-listener 4.0.0",
|
||||
"event-listener 5.4.0",
|
||||
"event-listener-strategy",
|
||||
"pin-project-lite",
|
||||
]
|
||||
@@ -1388,6 +1388,7 @@ dependencies = [
|
||||
"tower-http",
|
||||
"tower-otel",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
@@ -1403,9 +1404,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "concurrent-queue"
|
||||
version = "2.3.0"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400"
|
||||
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
@@ -2231,9 +2232,9 @@ checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "4.0.0"
|
||||
version = "5.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae"
|
||||
checksum = "3492acde4c3fc54c845eaab3eed8bd00c7a7d881f78bfc801e43a93dec1331ae"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"parking",
|
||||
@@ -2242,11 +2243,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "event-listener-strategy"
|
||||
version = "0.4.0"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
|
||||
checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93"
|
||||
dependencies = [
|
||||
"event-listener 4.0.0",
|
||||
"event-listener 5.4.0",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
@@ -2515,6 +2516,20 @@ version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
|
||||
|
||||
[[package]]
|
||||
name = "generator"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d18470a76cb7f8ff746cf1f7470914f900252ec36bbc40b569d74b1258446827"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"log",
|
||||
"rustversion",
|
||||
"windows 0.61.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
@@ -2833,7 +2848,7 @@ checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"windows",
|
||||
"windows 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3104,7 +3119,7 @@ dependencies = [
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
"windows-core",
|
||||
"windows-core 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3655,6 +3670,19 @@ version = "0.4.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
|
||||
|
||||
[[package]]
|
||||
name = "loom"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"generator",
|
||||
"scoped-tls",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.12.3"
|
||||
@@ -3871,6 +3899,25 @@ dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "moka"
|
||||
version = "0.12.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9321642ca94a4282428e6ea4af8cc2ca4eac48ac7a6a4ea8f33f76d0ce70926"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"loom",
|
||||
"parking_lot 0.12.1",
|
||||
"portable-atomic",
|
||||
"rustc_version",
|
||||
"smallvec",
|
||||
"tagptr",
|
||||
"thiserror 1.0.69",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "multimap"
|
||||
version = "0.8.3"
|
||||
@@ -5030,8 +5077,6 @@ dependencies = [
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"env_logger",
|
||||
"log",
|
||||
"memoffset 0.9.0",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"postgres_ffi_types",
|
||||
@@ -5384,7 +5429,6 @@ dependencies = [
|
||||
"futures",
|
||||
"gettid",
|
||||
"hashbrown 0.14.5",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hostname",
|
||||
@@ -5406,6 +5450,7 @@ dependencies = [
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
"moka",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"ouroboros",
|
||||
@@ -5472,6 +5517,7 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
"x509-cert",
|
||||
"zerocopy 0.8.24",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6419,6 +6465,12 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scoped-tls"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.1.0"
|
||||
@@ -7268,6 +7320,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tagptr"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.40"
|
||||
@@ -7934,11 +7992,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-appender"
|
||||
version = "0.2.2"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e"
|
||||
checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"thiserror 1.0.69",
|
||||
"time",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
@@ -8636,10 +8695,32 @@ version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
|
||||
dependencies = [
|
||||
"windows-core",
|
||||
"windows-core 0.52.0",
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows"
|
||||
version = "0.61.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893"
|
||||
dependencies = [
|
||||
"windows-collections",
|
||||
"windows-core 0.61.2",
|
||||
"windows-future",
|
||||
"windows-link",
|
||||
"windows-numerics",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-collections"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8"
|
||||
dependencies = [
|
||||
"windows-core 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.52.0"
|
||||
@@ -8649,6 +8730,86 @@ dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.61.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
|
||||
dependencies = [
|
||||
"windows-implement",
|
||||
"windows-interface",
|
||||
"windows-link",
|
||||
"windows-result",
|
||||
"windows-strings",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-future"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e"
|
||||
dependencies = [
|
||||
"windows-core 0.61.2",
|
||||
"windows-link",
|
||||
"windows-threading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-implement"
|
||||
version = "0.60.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-interface"
|
||||
version = "0.59.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
|
||||
|
||||
[[package]]
|
||||
name = "windows-numerics"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1"
|
||||
dependencies = [
|
||||
"windows-core 0.61.2",
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-result"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-strings"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
@@ -8707,6 +8868,15 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-threading"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.48.0"
|
||||
@@ -8843,6 +9013,8 @@ dependencies = [
|
||||
"clap",
|
||||
"clap_builder",
|
||||
"const-oid",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"crypto-bigint 0.5.5",
|
||||
"der 0.7.8",
|
||||
"deranged",
|
||||
@@ -8888,6 +9060,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"p256 0.13.2",
|
||||
"parquet",
|
||||
"portable-atomic",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost 0.13.5",
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@@ -46,10 +46,10 @@ members = [
|
||||
"libs/proxy/json",
|
||||
"libs/proxy/postgres-protocol2",
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/subzero_core",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"pgxn/neon/communicator",
|
||||
"proxy/subzero_core",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -135,7 +135,7 @@ lock_api = "0.4.13"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.9"
|
||||
moka = { version = "0.12", features = ["sync"] }
|
||||
nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
|
||||
# Do not update to >= 7.0.0, at least. The update will have a significant impact
|
||||
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
|
||||
@@ -145,7 +145,7 @@ num-traits = "0.2.19"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.30"
|
||||
opentelemetry_sdk = "0.30"
|
||||
opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-blocking-client"] }
|
||||
opentelemetry-semantic-conventions = "0.30"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
@@ -222,6 +222,7 @@ tracing-log = "0.2"
|
||||
tracing-opentelemetry = "0.31"
|
||||
tracing-serde = "0.2.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
tracing-appender = "0.2.3"
|
||||
try-lock = "0.2.5"
|
||||
test-log = { version = "0.2.17", default-features = false, features = ["log"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
@@ -232,9 +233,10 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
rustls-native-certs = "0.8"
|
||||
whoami = "1.5.1"
|
||||
zerocopy = { version = "0.8", features = ["derive", "simd"] }
|
||||
json-structural-diff = { version = "0.2.0" }
|
||||
x509-cert = { version = "0.2.5" }
|
||||
zerocopy = { version = "0.8", features = ["derive", "simd"] }
|
||||
zeroize = "1.8"
|
||||
|
||||
## TODO replace this with tracing
|
||||
env_logger = "0.11"
|
||||
|
||||
@@ -103,7 +103,7 @@ RUN --mount=type=secret,uid=1000,id=SUBZERO_ACCESS_TOKEN \
|
||||
&& if [ -s /run/secrets/SUBZERO_ACCESS_TOKEN ]; then \
|
||||
export CARGO_FEATURES="rest_broker"; \
|
||||
fi \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo auditable build \
|
||||
--features $CARGO_FEATURES \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
|
||||
@@ -39,13 +39,13 @@ COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
set -e && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ca-certificates wget gpg && \
|
||||
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
|
||||
apt-get update && \
|
||||
apt install -y --no-install-recommends \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
autotools-dev \
|
||||
libedit-dev \
|
||||
@@ -89,8 +89,7 @@ RUN useradd -ms /bin/bash nonroot -b /home
|
||||
# Use strict mode for bash to catch errors early
|
||||
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
|
||||
|
||||
RUN mkdir -p /pgcopydb/bin && \
|
||||
mkdir -p /pgcopydb/lib && \
|
||||
RUN mkdir -p /pgcopydb/{bin,lib} && \
|
||||
chmod -R 755 /pgcopydb && \
|
||||
chown -R nonroot:nonroot /pgcopydb
|
||||
|
||||
@@ -106,8 +105,8 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
|
||||
# 'gdb' is included so that we get backtraces of core dumps produced in
|
||||
# regression tests
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
autoconf \
|
||||
automake \
|
||||
bison \
|
||||
@@ -183,22 +182,22 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
||||
ENV LLVM_VERSION=20
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Install node
|
||||
ENV NODE_VERSION=24
|
||||
RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - \
|
||||
&& apt install -y nodejs \
|
||||
&& apt-get install -y --no-install-recommends nodejs \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Install docker
|
||||
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
|
||||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
|
||||
&& apt update \
|
||||
&& apt install -y docker-ce docker-ce-cli \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends docker-ce docker-ce-cli \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Configure sudo & docker
|
||||
@@ -215,12 +214,11 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION=v2.37.1
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& git clone -b "${MOLD_VERSION}" --depth 1 https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
&& cd mold/build \
|
||||
&& git checkout ${MOLD_VERSION} \
|
||||
&& cd mold/build \
|
||||
&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
|
||||
&& cmake --build . -j $(nproc) \
|
||||
&& cmake --build . -j "$(nproc)" \
|
||||
&& cmake --install . \
|
||||
&& cd .. \
|
||||
&& rm -rf mold
|
||||
@@ -254,7 +252,7 @@ ENV ICU_VERSION=67.1
|
||||
ENV ICU_PREFIX=/usr/local/icu
|
||||
|
||||
# Download and build static ICU
|
||||
RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
|
||||
RUN wget -O "/tmp/libicu-${ICU_VERSION}.tgz" https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
|
||||
echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \
|
||||
mkdir /tmp/icu && \
|
||||
pushd /tmp/icu && \
|
||||
@@ -265,8 +263,7 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re
|
||||
make install && \
|
||||
popd && \
|
||||
rm -rf icu && \
|
||||
rm -f /tmp/libicu-${ICU_VERSION}.tgz && \
|
||||
popd
|
||||
rm -f /tmp/libicu-${ICU_VERSION}.tgz
|
||||
|
||||
# Switch to nonroot user
|
||||
USER nonroot:nonroot
|
||||
@@ -279,19 +276,19 @@ ENV PYTHON_VERSION=3.11.12 \
|
||||
PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
|
||||
RUN set -e \
|
||||
&& cd $HOME \
|
||||
&& cd "$HOME" \
|
||||
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
|
||||
&& chmod +x pyenv-installer \
|
||||
&& ./pyenv-installer \
|
||||
&& export PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
&& export PATH="$PYENV_ROOT/bin:$PATH" \
|
||||
&& export PATH="$PYENV_ROOT/shims:$PATH" \
|
||||
&& pyenv install ${PYTHON_VERSION} \
|
||||
&& pyenv global ${PYTHON_VERSION} \
|
||||
&& pyenv install "${PYTHON_VERSION}" \
|
||||
&& pyenv global "${PYTHON_VERSION}" \
|
||||
&& python --version \
|
||||
&& pip install --upgrade pip \
|
||||
&& pip install --no-cache-dir --upgrade pip \
|
||||
&& pip --version \
|
||||
&& pip install pipenv wheel poetry
|
||||
&& pip install --no-cache-dir pipenv wheel poetry
|
||||
|
||||
# Switch to nonroot user (again)
|
||||
USER nonroot:nonroot
|
||||
@@ -302,6 +299,7 @@ WORKDIR /home/nonroot
|
||||
ENV RUSTC_VERSION=1.88.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG CARGO_AUDITABLE_VERSION=0.7.0
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
ARG CARGO_HAKARI_VERSION=0.9.36
|
||||
ARG CARGO_DENY_VERSION=0.18.2
|
||||
@@ -317,14 +315,16 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
cargo install rustfilt --locked --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --locked --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
cargo install cargo-hack --locked --version ${CARGO_HACK_VERSION} && \
|
||||
cargo install cargo-nextest --locked --version ${CARGO_NEXTEST_VERSION} && \
|
||||
cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
|
||||
cargo install diesel_cli --locked --version ${CARGO_DIESEL_CLI_VERSION} \
|
||||
--features postgres-bundled --no-default-features && \
|
||||
cargo install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" && \
|
||||
cargo auditable install cargo-auditable --locked --version "${CARGO_AUDITABLE_VERSION}" --force && \
|
||||
cargo auditable install rustfilt --version "${RUSTFILT_VERSION}" && \
|
||||
cargo auditable install cargo-hakari --locked --version "${CARGO_HAKARI_VERSION}" && \
|
||||
cargo auditable install cargo-deny --locked --version "${CARGO_DENY_VERSION}" && \
|
||||
cargo auditable install cargo-hack --locked --version "${CARGO_HACK_VERSION}" && \
|
||||
cargo auditable install cargo-nextest --locked --version "${CARGO_NEXTEST_VERSION}" && \
|
||||
cargo auditable install cargo-chef --locked --version "${CARGO_CHEF_VERSION}" && \
|
||||
cargo auditable install diesel_cli --locked --version "${CARGO_DIESEL_CLI_VERSION}" \
|
||||
--features postgres-bundled --no-default-features && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
|
||||
|
||||
51
build-tools/package-lock.json
generated
51
build-tools/package-lock.json
generated
@@ -6,7 +6,7 @@
|
||||
"": {
|
||||
"name": "build-tools",
|
||||
"devDependencies": {
|
||||
"@redocly/cli": "1.34.4",
|
||||
"@redocly/cli": "1.34.5",
|
||||
"@sourcemeta/jsonschema": "10.0.0"
|
||||
}
|
||||
},
|
||||
@@ -472,9 +472,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@redocly/cli": {
|
||||
"version": "1.34.4",
|
||||
"resolved": "https://registry.npmjs.org/@redocly/cli/-/cli-1.34.4.tgz",
|
||||
"integrity": "sha512-seH/GgrjSB1EeOsgJ/4Ct6Jk2N7sh12POn/7G8UQFARMyUMJpe1oHtBwT2ndfp4EFCpgBAbZ/82Iw6dwczNxEA==",
|
||||
"version": "1.34.5",
|
||||
"resolved": "https://registry.npmjs.org/@redocly/cli/-/cli-1.34.5.tgz",
|
||||
"integrity": "sha512-5IEwxs7SGP5KEXjBKLU8Ffdz9by/KqNSeBk6YUVQaGxMXK//uYlTJIPntgUXbo1KAGG2d2q2XF8y4iFz6qNeiw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -484,14 +484,14 @@
|
||||
"@opentelemetry/sdk-trace-node": "1.26.0",
|
||||
"@opentelemetry/semantic-conventions": "1.27.0",
|
||||
"@redocly/config": "^0.22.0",
|
||||
"@redocly/openapi-core": "1.34.4",
|
||||
"@redocly/respect-core": "1.34.4",
|
||||
"@redocly/openapi-core": "1.34.5",
|
||||
"@redocly/respect-core": "1.34.5",
|
||||
"abort-controller": "^3.0.0",
|
||||
"chokidar": "^3.5.1",
|
||||
"colorette": "^1.2.0",
|
||||
"core-js": "^3.32.1",
|
||||
"dotenv": "16.4.7",
|
||||
"form-data": "^4.0.0",
|
||||
"form-data": "^4.0.4",
|
||||
"get-port-please": "^3.0.1",
|
||||
"glob": "^7.1.6",
|
||||
"handlebars": "^4.7.6",
|
||||
@@ -522,9 +522,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@redocly/openapi-core": {
|
||||
"version": "1.34.4",
|
||||
"resolved": "https://registry.npmjs.org/@redocly/openapi-core/-/openapi-core-1.34.4.tgz",
|
||||
"integrity": "sha512-hf53xEgpXIgWl3b275PgZU3OTpYh1RoD2LHdIfQ1JzBNTWsiNKczTEsI/4Tmh2N1oq9YcphhSMyk3lDh85oDjg==",
|
||||
"version": "1.34.5",
|
||||
"resolved": "https://registry.npmjs.org/@redocly/openapi-core/-/openapi-core-1.34.5.tgz",
|
||||
"integrity": "sha512-0EbE8LRbkogtcCXU7liAyC00n9uNG9hJ+eMyHFdUsy9lB/WGqnEBgwjA9q2cyzAVcdTkQqTBBU1XePNnN3OijA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -544,21 +544,21 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@redocly/respect-core": {
|
||||
"version": "1.34.4",
|
||||
"resolved": "https://registry.npmjs.org/@redocly/respect-core/-/respect-core-1.34.4.tgz",
|
||||
"integrity": "sha512-MitKyKyQpsizA4qCVv+MjXL4WltfhFQAoiKiAzrVR1Kusro3VhYb6yJuzoXjiJhR0ukLP5QOP19Vcs7qmj9dZg==",
|
||||
"version": "1.34.5",
|
||||
"resolved": "https://registry.npmjs.org/@redocly/respect-core/-/respect-core-1.34.5.tgz",
|
||||
"integrity": "sha512-GheC/g/QFztPe9UA9LamooSplQuy9pe0Yr8XGTqkz0ahivLDl7svoy/LSQNn1QH3XGtLKwFYMfTwFR2TAYyh5Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@faker-js/faker": "^7.6.0",
|
||||
"@redocly/ajv": "8.11.2",
|
||||
"@redocly/openapi-core": "1.34.4",
|
||||
"@redocly/openapi-core": "1.34.5",
|
||||
"better-ajv-errors": "^1.2.0",
|
||||
"colorette": "^2.0.20",
|
||||
"concat-stream": "^2.0.0",
|
||||
"cookie": "^0.7.2",
|
||||
"dotenv": "16.4.7",
|
||||
"form-data": "4.0.0",
|
||||
"form-data": "^4.0.4",
|
||||
"jest-diff": "^29.3.1",
|
||||
"jest-matcher-utils": "^29.3.1",
|
||||
"js-yaml": "4.1.0",
|
||||
@@ -582,21 +582,6 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@redocly/respect-core/node_modules/form-data": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
|
||||
"integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"asynckit": "^0.4.0",
|
||||
"combined-stream": "^1.0.8",
|
||||
"mime-types": "^2.1.12"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 6"
|
||||
}
|
||||
},
|
||||
"node_modules/@sinclair/typebox": {
|
||||
"version": "0.27.8",
|
||||
"resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz",
|
||||
@@ -1345,9 +1330,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/form-data": {
|
||||
"version": "4.0.3",
|
||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.3.tgz",
|
||||
"integrity": "sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==",
|
||||
"version": "4.0.4",
|
||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz",
|
||||
"integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
"name": "build-tools",
|
||||
"private": true,
|
||||
"devDependencies": {
|
||||
"@redocly/cli": "1.34.4",
|
||||
"@redocly/cli": "1.34.5",
|
||||
"@sourcemeta/jsonschema": "10.0.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,7 +133,7 @@ RUN case $DEBIAN_VERSION in \
|
||||
# Install newer version (3.25) from backports.
|
||||
# libstdc++-10-dev is required for plv8
|
||||
bullseye) \
|
||||
echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
|
||||
echo "deb http://archive.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
|
||||
VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \
|
||||
;; \
|
||||
# Version-specific installs for Bookworm (PG17):
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
commit 5eb393810cf7c7bafa4e394dad2e349e2a8cb2cb
|
||||
Author: Alexey Masterov <alexey.masterov@databricks.com>
|
||||
Date: Mon Jul 28 18:11:02 2025 +0200
|
||||
|
||||
Patch for pg_repack
|
||||
|
||||
diff --git a/regress/Makefile b/regress/Makefile
|
||||
index bf6edcb..89b4c7f 100644
|
||||
index bf6edcb..110e734 100644
|
||||
--- a/regress/Makefile
|
||||
+++ b/regress/Makefile
|
||||
@@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\}
|
||||
@@ -7,18 +13,36 @@ index bf6edcb..89b4c7f 100644
|
||||
#
|
||||
|
||||
-REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger
|
||||
+REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger
|
||||
+REGRESS := init-extension noautovacuum repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger autovacuum
|
||||
|
||||
USE_PGXS = 1 # use pgxs if not in contrib directory
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
|
||||
index 9f2e171..f6e4f8d 100644
|
||||
--- a/regress/expected/init-extension.out
|
||||
+++ b/regress/expected/init-extension.out
|
||||
@@ -1,3 +1,2 @@
|
||||
SET client_min_messages = warning;
|
||||
CREATE EXTENSION pg_repack;
|
||||
-RESET client_min_messages;
|
||||
diff --git a/regress/expected/autovacuum.out b/regress/expected/autovacuum.out
|
||||
new file mode 100644
|
||||
index 0000000..e7f2363
|
||||
--- /dev/null
|
||||
+++ b/regress/expected/autovacuum.out
|
||||
@@ -0,0 +1,7 @@
|
||||
+ALTER SYSTEM SET autovacuum='on';
|
||||
+SELECT pg_reload_conf();
|
||||
+ pg_reload_conf
|
||||
+----------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
diff --git a/regress/expected/noautovacuum.out b/regress/expected/noautovacuum.out
|
||||
new file mode 100644
|
||||
index 0000000..fc7978e
|
||||
--- /dev/null
|
||||
+++ b/regress/expected/noautovacuum.out
|
||||
@@ -0,0 +1,7 @@
|
||||
+ALTER SYSTEM SET autovacuum='off';
|
||||
+SELECT pg_reload_conf();
|
||||
+ pg_reload_conf
|
||||
+----------------
|
||||
+ t
|
||||
+(1 row)
|
||||
+
|
||||
diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
|
||||
index 8d0a94e..63b68bf 100644
|
||||
--- a/regress/expected/nosuper.out
|
||||
@@ -50,14 +74,22 @@ index 8d0a94e..63b68bf 100644
|
||||
INFO: repacking table "public.tbl_cluster"
|
||||
ERROR: query failed: ERROR: current transaction is aborted, commands ignored until end of transaction block
|
||||
DETAIL: query was: RESET lock_timeout
|
||||
diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
|
||||
index 9f2e171..f6e4f8d 100644
|
||||
--- a/regress/sql/init-extension.sql
|
||||
+++ b/regress/sql/init-extension.sql
|
||||
@@ -1,3 +1,2 @@
|
||||
SET client_min_messages = warning;
|
||||
CREATE EXTENSION pg_repack;
|
||||
-RESET client_min_messages;
|
||||
diff --git a/regress/sql/autovacuum.sql b/regress/sql/autovacuum.sql
|
||||
new file mode 100644
|
||||
index 0000000..a8eda63
|
||||
--- /dev/null
|
||||
+++ b/regress/sql/autovacuum.sql
|
||||
@@ -0,0 +1,2 @@
|
||||
+ALTER SYSTEM SET autovacuum='on';
|
||||
+SELECT pg_reload_conf();
|
||||
diff --git a/regress/sql/noautovacuum.sql b/regress/sql/noautovacuum.sql
|
||||
new file mode 100644
|
||||
index 0000000..13d4836
|
||||
--- /dev/null
|
||||
+++ b/regress/sql/noautovacuum.sql
|
||||
@@ -0,0 +1,2 @@
|
||||
+ALTER SYSTEM SET autovacuum='off';
|
||||
+SELECT pg_reload_conf();
|
||||
diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
|
||||
index 072f0fa..dbe60f8 100644
|
||||
--- a/regress/sql/nosuper.sql
|
||||
|
||||
@@ -26,7 +26,13 @@ commands:
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
|
||||
# Turn off database collector (`--no-collector.database`), we don't use `pg_database_size_bytes` metric anyway, see
|
||||
# https://github.com/neondatabase/flux-fleet/blob/5e19b3fd897667b70d9a7ad4aa06df0ca22b49ff/apps/base/compute-metrics/scrape-compute-pg-exporter-neon.yaml#L29
|
||||
# but it's enabled by default and it doesn't filter out invalid databases, see
|
||||
# https://github.com/prometheus-community/postgres_exporter/blob/06a553c8166512c9d9c5ccf257b0f9bba8751dbc/collector/pg_database.go#L67
|
||||
# so if it hits one, it starts spamming logs
|
||||
# ERROR: [NEON_SMGR] [reqid d9700000018] could not read db size of db 705302 from page server at lsn 5/A2457EB0
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --no-collector.database --config.file=/etc/postgres_exporter.yml'
|
||||
- name: pgbouncer-exporter
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -26,7 +26,13 @@ commands:
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
|
||||
# Turn off database collector (`--no-collector.database`), we don't use `pg_database_size_bytes` metric anyway, see
|
||||
# https://github.com/neondatabase/flux-fleet/blob/5e19b3fd897667b70d9a7ad4aa06df0ca22b49ff/apps/base/compute-metrics/scrape-compute-pg-exporter-neon.yaml#L29
|
||||
# but it's enabled by default and it doesn't filter out invalid databases, see
|
||||
# https://github.com/prometheus-community/postgres_exporter/blob/06a553c8166512c9d9c5ccf257b0f9bba8751dbc/collector/pg_database.go#L67
|
||||
# so if it hits one, it starts spamming logs
|
||||
# ERROR: [NEON_SMGR] [reqid d9700000018] could not read db size of db 705302 from page server at lsn 5/A2457EB0
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --no-collector.database --config.file=/etc/postgres_exporter.yml'
|
||||
- name: pgbouncer-exporter
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -62,6 +62,7 @@ tokio-stream.workspace = true
|
||||
tonic.workspace = true
|
||||
tower-otel.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-appender.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
|
||||
@@ -52,8 +52,17 @@ stateDiagram-v2
|
||||
Init --> Running : Started Postgres
|
||||
Running --> TerminationPendingFast : Requested termination
|
||||
Running --> TerminationPendingImmediate : Requested termination
|
||||
Running --> ConfigurationPending : Received a /configure request with spec
|
||||
Running --> RefreshConfigurationPending : Received a /refresh_configuration request, compute node will pull a new spec and reconfigure
|
||||
RefreshConfigurationPending --> RefreshConfiguration: Received compute spec and started configuration
|
||||
RefreshConfiguration --> Running : Compute has been re-configured
|
||||
RefreshConfiguration --> RefreshConfigurationPending : Configuration failed and will be retried
|
||||
Running --> Reloading : Local changes (TLS certificate renewal) were detected and postgres is being reloaded
|
||||
Reloading --> Running : Postgres was reloaded
|
||||
Reloading --> Failed : Failed to reload postgres
|
||||
TerminationPendingFast --> Terminated : Terminated compute with 30s delay for cplane to inspect status
|
||||
TerminationPendingImmediate --> Terminated : Terminated compute immediately
|
||||
Failed --> RefreshConfigurationPending : Received a /refresh_configuration request
|
||||
Failed --> [*] : Compute exited
|
||||
Terminated --> [*] : Compute exited
|
||||
```
|
||||
|
||||
@@ -49,9 +49,10 @@ use compute_tools::compute::{
|
||||
BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
|
||||
};
|
||||
use compute_tools::extension_server::get_pg_version_string;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_isready::get_pg_isready_bin;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::{hadron_metrics, installed_extensions, logger::*};
|
||||
use rlimit::{Resource, setrlimit};
|
||||
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
|
||||
use signal_hook::iterator::Signals;
|
||||
@@ -81,6 +82,15 @@ struct Cli {
|
||||
#[arg(long, default_value_t = 3081)]
|
||||
pub internal_http_port: u16,
|
||||
|
||||
/// Backwards-compatible --http-port for Hadron deployments. Functionally the
|
||||
/// same as --external-http-port.
|
||||
#[arg(
|
||||
long,
|
||||
conflicts_with = "external_http_port",
|
||||
conflicts_with = "internal_http_port"
|
||||
)]
|
||||
pub http_port: Option<u16>,
|
||||
|
||||
#[arg(short = 'D', long, value_name = "DATADIR")]
|
||||
pub pgdata: String,
|
||||
|
||||
@@ -180,6 +190,26 @@ impl Cli {
|
||||
}
|
||||
}
|
||||
|
||||
// Hadron helpers to get compatible compute_ctl http ports from Cli. The old `--http-port`
|
||||
// arg is used and acts the same as `--external-http-port`. The internal http port is defined
|
||||
// to be http_port + 1. Hadron runs in the dblet environment which uses the host network, so
|
||||
// we need to be careful with the ports to choose.
|
||||
fn get_external_http_port(cli: &Cli) -> u16 {
|
||||
if cli.lakebase_mode {
|
||||
return cli.http_port.unwrap_or(cli.external_http_port);
|
||||
}
|
||||
cli.external_http_port
|
||||
}
|
||||
fn get_internal_http_port(cli: &Cli) -> u16 {
|
||||
if cli.lakebase_mode {
|
||||
return cli
|
||||
.http_port
|
||||
.map(|p| p + 1)
|
||||
.unwrap_or(cli.internal_http_port);
|
||||
}
|
||||
cli.internal_http_port
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
@@ -194,15 +224,28 @@ fn main() -> Result<()> {
|
||||
.build()?;
|
||||
let _rt_guard = runtime.enter();
|
||||
|
||||
let tracing_provider = init(cli.dev)?;
|
||||
let mut log_dir = None;
|
||||
if cli.lakebase_mode {
|
||||
log_dir = std::env::var("COMPUTE_CTL_LOG_DIRECTORY").ok();
|
||||
}
|
||||
|
||||
let (tracing_provider, _file_logs_guard) = init(cli.dev, log_dir)?;
|
||||
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
|
||||
if cli.lakebase_mode {
|
||||
installed_extensions::initialize_metrics();
|
||||
hadron_metrics::initialize_metrics();
|
||||
}
|
||||
|
||||
let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
|
||||
|
||||
let config = get_config(&cli)?;
|
||||
|
||||
let external_http_port = get_external_http_port(&cli);
|
||||
let internal_http_port = get_internal_http_port(&cli);
|
||||
|
||||
let compute_node = ComputeNode::new(
|
||||
ComputeNodeParams {
|
||||
compute_id: cli.compute_id,
|
||||
@@ -211,8 +254,8 @@ fn main() -> Result<()> {
|
||||
pgdata: cli.pgdata.clone(),
|
||||
pgbin: cli.pgbin.clone(),
|
||||
pgversion: get_pg_version_string(&cli.pgbin),
|
||||
external_http_port: cli.external_http_port,
|
||||
internal_http_port: cli.internal_http_port,
|
||||
external_http_port,
|
||||
internal_http_port,
|
||||
remote_ext_base_url: cli.remote_ext_base_url.clone(),
|
||||
resize_swap_on_bind: cli.resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
|
||||
@@ -226,7 +269,12 @@ fn main() -> Result<()> {
|
||||
cli.installed_extensions_collection_interval,
|
||||
)),
|
||||
pg_init_timeout: cli.pg_init_timeout.map(Duration::from_secs),
|
||||
pg_isready_bin: get_pg_isready_bin(&cli.pgbin),
|
||||
instance_id: std::env::var("INSTANCE_ID").ok(),
|
||||
lakebase_mode: cli.lakebase_mode,
|
||||
build_tag: BUILD_TAG.to_string(),
|
||||
control_plane_uri: cli.control_plane_uri,
|
||||
config_path_test_only: cli.config,
|
||||
},
|
||||
config,
|
||||
)?;
|
||||
@@ -238,8 +286,14 @@ fn main() -> Result<()> {
|
||||
deinit_and_exit(tracing_provider, exit_code);
|
||||
}
|
||||
|
||||
fn init(dev_mode: bool) -> Result<Option<tracing_utils::Provider>> {
|
||||
let provider = init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
fn init(
|
||||
dev_mode: bool,
|
||||
log_dir: Option<String>,
|
||||
) -> Result<(
|
||||
Option<tracing_utils::Provider>,
|
||||
Option<tracing_appender::non_blocking::WorkerGuard>,
|
||||
)> {
|
||||
let (provider, file_logs_guard) = init_tracing_and_logging(DEFAULT_LOG_LEVEL, &log_dir)?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
@@ -250,7 +304,7 @@ fn init(dev_mode: bool) -> Result<Option<tracing_utils::Provider>> {
|
||||
|
||||
info!("compute build_tag: {}", &BUILD_TAG.to_string());
|
||||
|
||||
Ok(provider)
|
||||
Ok((provider, file_logs_guard))
|
||||
}
|
||||
|
||||
fn get_config(cli: &Cli) -> Result<ComputeConfig> {
|
||||
|
||||
@@ -6,7 +6,8 @@ use compute_api::responses::{
|
||||
LfcPrewarmState, PromoteState, TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
|
||||
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, GenericOption,
|
||||
PageserverConnectionInfo, PageserverProtocol, PgIdent, Role,
|
||||
};
|
||||
use futures::StreamExt;
|
||||
use futures::future::join_all;
|
||||
@@ -21,12 +22,13 @@ use postgres::NoTls;
|
||||
use postgres::error::SqlState;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::ffi::OsString;
|
||||
use std::os::unix::fs::{PermissionsExt, symlink};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex, RwLock};
|
||||
use std::sync::{Arc, Condvar, Mutex, MutexGuard, RwLock};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{env, fs};
|
||||
use tokio::{spawn, sync::watch, task::JoinHandle, time};
|
||||
@@ -36,12 +38,13 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
use utils::pid_file;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
use utils::shard::{ShardIndex, ShardNumber, ShardStripeSize};
|
||||
|
||||
use crate::configurator::launch_configurator;
|
||||
use crate::disk_quota::set_disk_quota;
|
||||
use crate::hadron_metrics::COMPUTE_ATTACHED;
|
||||
use crate::installed_extensions::get_installed_extensions;
|
||||
use crate::logger::startup_context_from_env;
|
||||
use crate::logger::{self, startup_context_from_env};
|
||||
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use crate::metrics::COMPUTE_CTL_UP;
|
||||
use crate::monitor::launch_monitor;
|
||||
@@ -54,7 +57,6 @@ use crate::rsyslog::{
|
||||
use crate::spec::*;
|
||||
use crate::swap::resize_swap;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::tls::watch_cert_for_changes;
|
||||
use crate::{config, extension_server, local_proxy};
|
||||
|
||||
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
|
||||
@@ -113,11 +115,17 @@ pub struct ComputeNodeParams {
|
||||
|
||||
/// Interval for installed extensions collection
|
||||
pub installed_extensions_collection_interval: Arc<AtomicU64>,
|
||||
|
||||
/// Hadron instance ID of the compute node.
|
||||
pub instance_id: Option<String>,
|
||||
/// Timeout of PG compute startup in the Init state.
|
||||
pub pg_init_timeout: Option<Duration>,
|
||||
|
||||
// Path to the `pg_isready` binary.
|
||||
pub pg_isready_bin: String,
|
||||
pub lakebase_mode: bool,
|
||||
|
||||
pub build_tag: String,
|
||||
pub control_plane_uri: Option<String>,
|
||||
pub config_path_test_only: Option<OsString>,
|
||||
}
|
||||
|
||||
type TaskHandle = Mutex<Option<JoinHandle<()>>>;
|
||||
@@ -240,7 +248,7 @@ pub struct ParsedSpec {
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub pageserver_connstr: String,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
/// k8s dns name and port
|
||||
@@ -288,25 +296,47 @@ impl ParsedSpec {
|
||||
}
|
||||
|
||||
impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
type Error = String;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
|
||||
type Error = anyhow::Error;
|
||||
fn try_from(spec: ComputeSpec) -> Result<Self, anyhow::Error> {
|
||||
// Extract the options from the spec file that are needed to connect to
|
||||
// the storage system.
|
||||
//
|
||||
// For backwards-compatibility, the top-level fields in the spec file
|
||||
// may be empty. In that case, we need to dig them from the GUCs in the
|
||||
// cluster.settings field.
|
||||
let pageserver_connstr = spec
|
||||
.pageserver_connstring
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
// In compute specs generated by old control plane versions, the spec file might
|
||||
// be missing the `pageserver_connection_info` field. In that case, we need to dig
|
||||
// the pageserver connection info from the `pageserver_connstr` field instead, or
|
||||
// if that's missing too, from the GUC in the cluster.settings field.
|
||||
let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
|
||||
if pageserver_conninfo.is_none() {
|
||||
if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
|
||||
pageserver_conninfo = Some(PageserverConnectionInfo::from_connstr(
|
||||
pageserver_connstr_field,
|
||||
spec.shard_stripe_size,
|
||||
)?);
|
||||
}
|
||||
}
|
||||
if pageserver_conninfo.is_none() {
|
||||
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
|
||||
let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
|
||||
{
|
||||
Some(ShardStripeSize(u32::from_str(&guc)?))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
pageserver_conninfo =
|
||||
Some(PageserverConnectionInfo::from_connstr(&guc, stripe_size)?);
|
||||
}
|
||||
}
|
||||
let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!(
|
||||
"pageserver connection information should be provided"
|
||||
))?;
|
||||
|
||||
// Similarly for safekeeper connection strings
|
||||
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.safekeepers")
|
||||
.ok_or("safekeeper connstrings should be provided")?
|
||||
.ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))?
|
||||
.split(',')
|
||||
.map(|str| str.to_string())
|
||||
.collect()
|
||||
@@ -321,22 +351,22 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
} else {
|
||||
spec.cluster
|
||||
let guc = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.tenant_id")
|
||||
.ok_or("tenant id should be provided")
|
||||
.map(|s| TenantId::from_str(&s))?
|
||||
.or(Err("invalid tenant id"))?
|
||||
.ok_or(anyhow::anyhow!("tenant id should be provided"))?;
|
||||
TenantId::from_str(&guc).context("invalid tenant id")?
|
||||
};
|
||||
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
|
||||
timeline_id
|
||||
} else {
|
||||
spec.cluster
|
||||
let guc = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.timeline_id")
|
||||
.ok_or("timeline id should be provided")
|
||||
.map(|s| TimelineId::from_str(&s))?
|
||||
.or(Err("invalid timeline id"))?
|
||||
.ok_or(anyhow::anyhow!("timeline id should be provided"))?;
|
||||
TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))?
|
||||
};
|
||||
|
||||
let endpoint_storage_addr: Option<String> = spec
|
||||
@@ -350,7 +380,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
|
||||
let res = ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
pageserver_conninfo,
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
@@ -360,7 +390,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
};
|
||||
|
||||
// Now check validity of the parsed specification
|
||||
res.validate()?;
|
||||
res.validate().map_err(anyhow::Error::msg)?;
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
@@ -405,6 +435,130 @@ struct StartVmMonitorResult {
|
||||
vm_monitor: Option<JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
// BEGIN_HADRON
|
||||
/// This function creates roles that are used by Databricks.
|
||||
/// These roles need to be bootstrapped at PG Compute provisioning time.
|
||||
/// The auth method for these roles are configured in databricks_pg_hba.conf in universe repository.
|
||||
pub(crate) fn create_databricks_roles() -> Vec<String> {
|
||||
let roles = vec![
|
||||
// Role for prometheus_stats_exporter
|
||||
Role {
|
||||
name: "databricks_monitor".to_string(),
|
||||
// This uses "local" connection and auth method for that is "trust", so no password is needed.
|
||||
encrypted_password: None,
|
||||
options: Some(vec![GenericOption {
|
||||
name: "IN ROLE pg_monitor".to_string(),
|
||||
value: None,
|
||||
vartype: "string".to_string(),
|
||||
}]),
|
||||
},
|
||||
// Role for brickstore control plane
|
||||
Role {
|
||||
name: "databricks_control_plane".to_string(),
|
||||
// Certificate user does not need password.
|
||||
encrypted_password: None,
|
||||
options: Some(vec![GenericOption {
|
||||
name: "SUPERUSER".to_string(),
|
||||
value: None,
|
||||
vartype: "string".to_string(),
|
||||
}]),
|
||||
},
|
||||
// Role for brickstore httpgateway.
|
||||
Role {
|
||||
name: "databricks_gateway".to_string(),
|
||||
// Certificate user does not need password.
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
},
|
||||
];
|
||||
|
||||
roles
|
||||
.into_iter()
|
||||
.map(|role| {
|
||||
let query = format!(
|
||||
r#"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = '{}')
|
||||
THEN
|
||||
CREATE ROLE {} {};
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
role.name,
|
||||
role.name.pg_quote(),
|
||||
role.to_pg_options(),
|
||||
);
|
||||
query
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Databricks-specific environment variables to be passed to the `postgres` sub-process.
|
||||
pub struct DatabricksEnvVars {
|
||||
/// The Databricks "endpoint ID" of the compute instance. Used by `postgres` to check
|
||||
/// the token scopes of internal auth tokens.
|
||||
pub endpoint_id: String,
|
||||
/// Hostname of the Databricks workspace URL this compute instance belongs to.
|
||||
/// Used by postgres to verify Databricks PAT tokens.
|
||||
pub workspace_host: String,
|
||||
|
||||
pub lakebase_mode: bool,
|
||||
}
|
||||
|
||||
impl DatabricksEnvVars {
|
||||
pub fn new(
|
||||
compute_spec: &ComputeSpec,
|
||||
compute_id: Option<&String>,
|
||||
instance_id: Option<String>,
|
||||
lakebase_mode: bool,
|
||||
) -> Self {
|
||||
let endpoint_id = if let Some(instance_id) = instance_id {
|
||||
// Use instance_id as endpoint_id if it is set. This code path is for PuPr model.
|
||||
instance_id
|
||||
} else {
|
||||
// Use compute_id as endpoint_id if instance_id is not set. The code path is for PrPr model.
|
||||
// compute_id is a string format of "{endpoint_id}/{compute_idx}"
|
||||
// endpoint_id is a uuid. We only need to pass down endpoint_id to postgres.
|
||||
// Panics if compute_id is not set or not in the expected format.
|
||||
compute_id.unwrap().split('/').next().unwrap().to_string()
|
||||
};
|
||||
let workspace_host = compute_spec
|
||||
.databricks_settings
|
||||
.as_ref()
|
||||
.map(|s| s.databricks_workspace_host.clone())
|
||||
.unwrap_or("".to_string());
|
||||
Self {
|
||||
endpoint_id,
|
||||
workspace_host,
|
||||
lakebase_mode,
|
||||
}
|
||||
}
|
||||
|
||||
/// Constants for the names of Databricks-specific postgres environment variables.
|
||||
const DATABRICKS_ENDPOINT_ID_ENVVAR: &'static str = "DATABRICKS_ENDPOINT_ID";
|
||||
const DATABRICKS_WORKSPACE_HOST_ENVVAR: &'static str = "DATABRICKS_WORKSPACE_HOST";
|
||||
|
||||
/// Convert DatabricksEnvVars to a list of string pairs that can be passed as env vars. Consumes `self`.
|
||||
pub fn to_env_var_list(self) -> Vec<(String, String)> {
|
||||
if !self.lakebase_mode {
|
||||
// In neon env, we don't need to pass down the env vars to postgres.
|
||||
return vec![];
|
||||
}
|
||||
vec![
|
||||
(
|
||||
Self::DATABRICKS_ENDPOINT_ID_ENVVAR.to_string(),
|
||||
self.endpoint_id.clone(),
|
||||
),
|
||||
(
|
||||
Self::DATABRICKS_WORKSPACE_HOST_ENVVAR.to_string(),
|
||||
self.workspace_host.clone(),
|
||||
),
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
|
||||
let connstr = params.connstr.as_str();
|
||||
@@ -441,7 +595,11 @@ impl ComputeNode {
|
||||
let mut new_state = ComputeState::new();
|
||||
if let Some(spec) = config.spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
if params.lakebase_mode {
|
||||
ComputeNode::set_spec(¶ms, &mut new_state, pspec);
|
||||
} else {
|
||||
new_state.pspec = Some(pspec);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ComputeNode {
|
||||
@@ -486,6 +644,7 @@ impl ComputeNode {
|
||||
port: this.params.external_http_port,
|
||||
config: this.compute_ctl_config.clone(),
|
||||
compute_id: this.params.compute_id.clone(),
|
||||
instance_id: this.params.instance_id.clone(),
|
||||
}
|
||||
.launch(&this);
|
||||
|
||||
@@ -682,14 +841,11 @@ impl ComputeNode {
|
||||
let mut pre_tasks = tokio::task::JoinSet::new();
|
||||
|
||||
// Make sure TLS certificates are properly loaded and in the right place.
|
||||
if self.compute_ctl_config.tls.is_some() {
|
||||
let tls_task = self.compute_ctl_config.tls.as_ref().map(|tls_config| {
|
||||
let this = self.clone();
|
||||
pre_tasks.spawn(async move {
|
||||
this.watch_cert_for_changes().await;
|
||||
|
||||
Ok::<(), anyhow::Error>(())
|
||||
});
|
||||
}
|
||||
let tls_config = tls_config.clone();
|
||||
tokio::task::spawn_blocking(|| this.watch_cert_for_changes(tls_config))
|
||||
});
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
|
||||
@@ -744,6 +900,13 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
// Wait for TLS certificates to be issued before updating pgbouncer and local proxy.
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
if let Some(tls_task) = tls_task {
|
||||
rt.block_on(tls_task)
|
||||
.context("TLS certificate renewal task panicked")?;
|
||||
}
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
|
||||
info!("tuning pgbouncer");
|
||||
@@ -826,7 +989,6 @@ impl ComputeNode {
|
||||
let _configurator_handle = launch_configurator(self);
|
||||
|
||||
// Wait for all the pre-tasks to finish before starting postgres
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
while let Some(res) = rt.block_on(pre_tasks.join_next()) {
|
||||
res??;
|
||||
}
|
||||
@@ -1038,7 +1200,14 @@ impl ComputeNode {
|
||||
// If it is something different then create_dir() will error out anyway.
|
||||
let pgdata = &self.params.pgdata;
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
fs::create_dir(pgdata)?;
|
||||
if self.params.lakebase_mode {
|
||||
// Ignore creation errors if the directory already exists (e.g. mounting it ahead of time).
|
||||
// If it is something different then PG startup will error out anyway.
|
||||
let _ok = fs::create_dir(pgdata);
|
||||
} else {
|
||||
fs::create_dir(pgdata)?;
|
||||
}
|
||||
|
||||
fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?;
|
||||
|
||||
Ok(())
|
||||
@@ -1050,12 +1219,10 @@ impl ComputeNode {
|
||||
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let started = Instant::now();
|
||||
|
||||
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
|
||||
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
|
||||
let (connected, size) = match spec.pageserver_conninfo.prefer_protocol {
|
||||
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
|
||||
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
|
||||
};
|
||||
|
||||
self.fix_zenith_signal_neon_signal()?;
|
||||
@@ -1093,23 +1260,20 @@ impl ComputeNode {
|
||||
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
|
||||
/// the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0_connstr = spec
|
||||
.pageserver_connstr
|
||||
.split(',')
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
|
||||
0 | 1 => ShardIndex::unsharded(),
|
||||
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
|
||||
let shard0_index = ShardIndex {
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: spec.pageserver_conninfo.shard_count,
|
||||
};
|
||||
|
||||
let shard0_url = spec
|
||||
.pageserver_conninfo
|
||||
.shard_url(ShardNumber(0), PageserverProtocol::Grpc)?
|
||||
.to_owned();
|
||||
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
|
||||
let mut client = page_api::Client::connect(
|
||||
shard0_connstr,
|
||||
shard0_url,
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
shard_index,
|
||||
shard0_index,
|
||||
spec.storage_auth_token.clone(),
|
||||
None, // NB: base backups use payload compression
|
||||
)
|
||||
@@ -1141,7 +1305,9 @@ impl ComputeNode {
|
||||
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
|
||||
/// when the connection was established, and the (compressed) size of the basebackup.
|
||||
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let shard0_connstr = spec
|
||||
.pageserver_conninfo
|
||||
.shard_url(ShardNumber(0), PageserverProtocol::Libpq)?;
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
@@ -1228,10 +1394,7 @@ impl ComputeNode {
|
||||
return result;
|
||||
}
|
||||
Err(ref e) if attempts < max_attempts => {
|
||||
warn!(
|
||||
"Failed to get basebackup: {} (attempt {}/{})",
|
||||
e, attempts, max_attempts
|
||||
);
|
||||
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
|
||||
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
}
|
||||
@@ -1402,6 +1565,8 @@ impl ComputeNode {
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
|
||||
let tls_config = self.tls_config(&pspec.spec);
|
||||
let databricks_settings = spec.databricks_settings.as_ref();
|
||||
let postgres_port = self.params.connstr.port();
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
@@ -1409,8 +1574,11 @@ impl ComputeNode {
|
||||
pgdata_path,
|
||||
&self.params,
|
||||
&pspec.spec,
|
||||
postgres_port,
|
||||
self.params.internal_http_port,
|
||||
tls_config,
|
||||
databricks_settings,
|
||||
self.params.lakebase_mode,
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
@@ -1439,19 +1607,31 @@ impl ComputeNode {
|
||||
}
|
||||
};
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &pspec.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(compute_state, lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &pspec.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
self.get_basebackup(compute_state, lsn)
|
||||
.with_context(|| format!("failed to get basebackup@{lsn}"))?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path, None)?;
|
||||
if let Some(settings) = databricks_settings {
|
||||
copy_tls_certificates(
|
||||
&settings.pg_compute_tls_settings.key_file,
|
||||
&settings.pg_compute_tls_settings.cert_file,
|
||||
pgdata_path,
|
||||
)?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup including additional databricks settings.
|
||||
update_pg_hba(pgdata_path, Some(&settings.databricks_pg_hba))?;
|
||||
update_pg_ident(pgdata_path, Some(&settings.databricks_pg_ident))?;
|
||||
} else {
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path, None)?;
|
||||
}
|
||||
|
||||
if let Some(databricks_settings) = spec.databricks_settings.as_ref() {
|
||||
copy_tls_certificates(
|
||||
&databricks_settings.pg_compute_tls_settings.key_file,
|
||||
&databricks_settings.pg_compute_tls_settings.cert_file,
|
||||
pgdata_path,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Place pg_dynshmem under /dev/shm. This allows us to use
|
||||
// 'dynamic_shared_memory_type = mmap' so that the files are placed in
|
||||
@@ -1492,7 +1672,7 @@ impl ComputeNode {
|
||||
// symlink doesn't affect anything.
|
||||
//
|
||||
// See https://github.com/neondatabase/autoscaling/issues/800
|
||||
std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
|
||||
std::fs::remove_dir_all(pgdata_path.join("pg_dynshmem"))?;
|
||||
symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
|
||||
|
||||
match spec.mode {
|
||||
@@ -1507,6 +1687,12 @@ impl ComputeNode {
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
|
||||
if self.params.lakebase_mode {
|
||||
// We are running in Hadron mode. Disabling this prewarming step for now as it could run
|
||||
// into dblet port conflicts and also doesn't add much value with our current infra.
|
||||
info!("Skipping postgres prewarming in Hadron mode");
|
||||
return Ok(());
|
||||
}
|
||||
info!("prewarming VM memory");
|
||||
|
||||
// Create pgdata
|
||||
@@ -1564,14 +1750,36 @@ impl ComputeNode {
|
||||
pub fn start_postgres(&self, storage_auth_token: Option<String>) -> Result<PostgresHandle> {
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
|
||||
let env_vars: Vec<(String, String)> = if self.params.lakebase_mode {
|
||||
let databricks_env_vars = {
|
||||
let state = self.state.lock().unwrap();
|
||||
let spec = &state.pspec.as_ref().unwrap().spec;
|
||||
DatabricksEnvVars::new(
|
||||
spec,
|
||||
Some(&self.params.compute_id),
|
||||
self.params.instance_id.clone(),
|
||||
self.params.lakebase_mode,
|
||||
)
|
||||
};
|
||||
|
||||
info!(
|
||||
"Starting Postgres for databricks endpoint id: {}",
|
||||
&databricks_env_vars.endpoint_id
|
||||
);
|
||||
|
||||
let mut env_vars = databricks_env_vars.to_env_var_list();
|
||||
env_vars.extend(storage_auth_token.map(|t| ("NEON_AUTH_TOKEN".to_string(), t)));
|
||||
env_vars
|
||||
} else if let Some(storage_auth_token) = &storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN".to_owned(), storage_auth_token.to_owned())]
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = maybe_cgexec(&self.params.pgbin)
|
||||
.args(["-D", &self.params.pgdata])
|
||||
.envs(if let Some(storage_auth_token) = &storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
})
|
||||
.envs(env_vars)
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
@@ -1723,7 +1931,15 @@ impl ComputeNode {
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
#[instrument(skip_all)]
|
||||
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
|
||||
let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
|
||||
|
||||
if self.params.lakebase_mode {
|
||||
// Set a 2-minute statement_timeout for the session applying config. The individual SQL statements
|
||||
// used in apply_spec_sql() should not take long (they are just creating users and installing
|
||||
// extensions). If any of them are stuck for an extended period of time it usually indicates a
|
||||
// pageserver connectivity problem and we should bail out.
|
||||
conf.options("-c statement_timeout=2min");
|
||||
}
|
||||
|
||||
let conf = Arc::new(conf);
|
||||
let spec = Arc::new(
|
||||
@@ -1735,10 +1951,7 @@ impl ComputeNode {
|
||||
.clone(),
|
||||
);
|
||||
|
||||
let mut tls_config = None::<TlsConfig>;
|
||||
if spec.features.contains(&ComputeFeature::TlsExperimental) {
|
||||
tls_config = self.compute_ctl_config.tls.clone();
|
||||
}
|
||||
let tls_config = self.tls_config(&spec);
|
||||
|
||||
self.update_installed_extensions_collection_interval(&spec);
|
||||
|
||||
@@ -1785,6 +1998,34 @@ impl ComputeNode {
|
||||
Ok::<(), anyhow::Error>(())
|
||||
}
|
||||
|
||||
// Signal to the configurator to refresh the configuration by pulling a new spec from the HCC.
|
||||
// Note that this merely triggers a notification on a condition variable the configurator thread
|
||||
// waits on. The configurator thread (in configurator.rs) pulls the new spec from the HCC and
|
||||
// applies it.
|
||||
pub async fn signal_refresh_configuration(&self) -> Result<()> {
|
||||
let states_allowing_configuration_refresh = [
|
||||
ComputeStatus::Running,
|
||||
ComputeStatus::Failed,
|
||||
ComputeStatus::RefreshConfigurationPending,
|
||||
];
|
||||
|
||||
let mut state = self.state.lock().expect("state lock poisoned");
|
||||
if states_allowing_configuration_refresh.contains(&state.status) {
|
||||
state.status = ComputeStatus::RefreshConfigurationPending;
|
||||
self.state_changed.notify_all();
|
||||
Ok(())
|
||||
} else if state.status == ComputeStatus::Init {
|
||||
// If the compute is in Init state, we can't refresh the configuration immediately,
|
||||
// but we should be able to do that soon.
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow::anyhow!(
|
||||
"Cannot refresh compute configuration in state {:?}",
|
||||
state.status
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
// Wrapped this around `pg_ctl reload`, but right now we don't use
|
||||
// `pg_ctl` for start / stop.
|
||||
#[instrument(skip_all)]
|
||||
@@ -1846,12 +2087,16 @@ impl ComputeNode {
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.params.pgdata);
|
||||
let postgres_port = self.params.connstr.port();
|
||||
config::write_postgres_conf(
|
||||
pgdata_path,
|
||||
&self.params,
|
||||
&spec,
|
||||
postgres_port,
|
||||
self.params.internal_http_port,
|
||||
tls_config,
|
||||
spec.databricks_settings.as_ref(),
|
||||
self.params.lakebase_mode,
|
||||
)?;
|
||||
|
||||
self.pg_reload_conf()?;
|
||||
@@ -1888,6 +2133,60 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tell postgres/pgbouncer/local_proxy to reload their configurations.
|
||||
#[instrument(skip_all)]
|
||||
pub fn reload(&self, spec: ComputeSpec) -> Result<()> {
|
||||
let rt = tokio::runtime::Handle::current();
|
||||
if spec.pgbouncer_settings.is_some() {
|
||||
rt.block_on(reload_pgbouncer())?;
|
||||
}
|
||||
if spec.local_proxy_config.is_some() {
|
||||
local_proxy::reload()?;
|
||||
}
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let unknown_op = "unknown".to_string();
|
||||
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
||||
info!("finished reload of compute node for operation {op_id}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Acquire the "reloading" lock while running the supplied function.
|
||||
///
|
||||
/// This ensures that this thread is the only thread that
|
||||
/// can issue signals to postgres.
|
||||
///
|
||||
/// If the supplied function errors, the compute status is marked as failed.
|
||||
pub fn lock_while_reloading<T>(
|
||||
&self,
|
||||
mut state: MutexGuard<'_, ComputeState>,
|
||||
f: impl FnOnce(ComputeSpec) -> Result<T>,
|
||||
) -> Result<T> {
|
||||
let old_status = state.status;
|
||||
|
||||
// transition to the reloading state.
|
||||
state.set_status(ComputeStatus::Reloading, &self.state_changed);
|
||||
let spec = state.pspec.as_ref().unwrap().spec.clone();
|
||||
// unlock while reloading, so we don't block other tasks.
|
||||
drop(state);
|
||||
|
||||
let res = f(spec);
|
||||
|
||||
let new_status = if res.is_ok() {
|
||||
old_status
|
||||
} else {
|
||||
ComputeStatus::Failed
|
||||
};
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
// make sure our invariants are upheld
|
||||
assert_eq!(state.status, ComputeStatus::Reloading);
|
||||
state.set_status(new_status, &self.state_changed);
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
@@ -1922,55 +2221,103 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn watch_cert_for_changes(self: Arc<Self>) {
|
||||
// update status on cert renewal
|
||||
if let Some(tls_config) = &self.compute_ctl_config.tls {
|
||||
let tls_config = tls_config.clone();
|
||||
pub fn watch_cert_for_changes(self: Arc<Self>, tls_config: TlsConfig) {
|
||||
// wait until the cert exists.
|
||||
let mut digest = crate::tls::compute_digest(&tls_config.cert_path);
|
||||
info!(
|
||||
cert_path = tls_config.cert_path,
|
||||
key_path = tls_config.key_path,
|
||||
"TLS certificates found"
|
||||
);
|
||||
|
||||
// wait until the cert exists.
|
||||
let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await;
|
||||
// ensure the keys are saved before continuing.
|
||||
let key_pair = crate::tls::load_certs_blocking(&tls_config);
|
||||
while let Err(e) =
|
||||
crate::tls::update_key_path_blocking(Path::new(&self.params.pgdata), &key_pair)
|
||||
{
|
||||
error!("could not save TLS certificates: {e}");
|
||||
std::thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
'cert_update: loop {
|
||||
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
|
||||
// we need to wait until it's configurable first.
|
||||
tokio::task::spawn_blocking(move || {
|
||||
'cert_update: loop {
|
||||
// wait for a new certificate update
|
||||
let new_digest = crate::tls::wait_until_cert_changed(digest, &tls_config.cert_path);
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
'status_update: loop {
|
||||
match state.status {
|
||||
// let's update the state to config pending
|
||||
ComputeStatus::ConfigurationPending | ComputeStatus::Running => {
|
||||
state.set_status(
|
||||
ComputeStatus::ConfigurationPending,
|
||||
&self.state_changed,
|
||||
);
|
||||
break 'status_update;
|
||||
}
|
||||
// load the corresponding keys
|
||||
let key_pair = crate::tls::load_certs_blocking(&tls_config);
|
||||
|
||||
// exit loop
|
||||
ComputeStatus::Failed
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => break 'cert_update,
|
||||
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
|
||||
// we need to wait until it's configurable first.
|
||||
|
||||
// wait
|
||||
ComputeStatus::Init
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::Empty => {
|
||||
state = self.state_changed.wait(state).unwrap();
|
||||
}
|
||||
let mut state = self.state.lock().unwrap();
|
||||
'status_update: loop {
|
||||
match state.status {
|
||||
// let's update the state to config pending
|
||||
ComputeStatus::Running => {
|
||||
info!("reloading compute due to TLS certificate renewal");
|
||||
break 'status_update;
|
||||
}
|
||||
|
||||
// exit loop
|
||||
ComputeStatus::Failed
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => break 'cert_update,
|
||||
|
||||
// wait
|
||||
ComputeStatus::Init
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::RefreshConfiguration
|
||||
| ComputeStatus::RefreshConfigurationPending
|
||||
| ComputeStatus::Reloading
|
||||
| ComputeStatus::Empty => {
|
||||
state = self.state_changed.wait(state).unwrap();
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
}
|
||||
|
||||
// wait for a new certificate update
|
||||
if handle.block_on(cert_watch.changed()).is_err() {
|
||||
break;
|
||||
let result = self.lock_while_reloading(state, |spec| {
|
||||
// ensure the keys are saved before continuing.
|
||||
// we do this while holding the 'reloading' state so that we know we're not interfering with any
|
||||
// active configuration stages.
|
||||
if let Err(e) = crate::tls::update_key_path_blocking(
|
||||
Path::new(&self.params.pgdata),
|
||||
&key_pair,
|
||||
) {
|
||||
return Ok(Err(e));
|
||||
}
|
||||
|
||||
// reload postgres/pgbouncer/local_proxy to pick up our new certificates.
|
||||
self.reload(spec)?;
|
||||
|
||||
Ok(Ok(()))
|
||||
});
|
||||
|
||||
match result {
|
||||
// Reload failed. Compute is in a bad state.
|
||||
Err(e) => {
|
||||
error!("could not reload compute node: {}", e);
|
||||
return;
|
||||
}
|
||||
// Updating the certificates failed. Retry
|
||||
Ok(Err(e)) => {
|
||||
error!("could not save TLS certificates: {e}");
|
||||
std::thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
// Successful. Acknowledge that we've saved these certificates.
|
||||
Ok(Ok(())) => {
|
||||
digest = new_digest;
|
||||
info!(
|
||||
cert_path = tls_config.cert_path,
|
||||
key_path = tls_config.key_path,
|
||||
"TLS certificates renewed",
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
|
||||
@@ -2007,7 +2354,17 @@ impl ComputeNode {
|
||||
pub fn check_for_core_dumps(&self) -> Result<()> {
|
||||
let core_dump_dir = match std::env::consts::OS {
|
||||
"macos" => Path::new("/cores/"),
|
||||
_ => Path::new(&self.params.pgdata),
|
||||
// BEGIN HADRON
|
||||
// NB: Read core dump files from a fixed location outside of
|
||||
// the data directory since `compute_ctl` wipes the data directory
|
||||
// across container restarts.
|
||||
_ => {
|
||||
if self.params.lakebase_mode {
|
||||
Path::new("/databricks/logs/brickstore")
|
||||
} else {
|
||||
Path::new(&self.params.pgdata)
|
||||
}
|
||||
} // END HADRON
|
||||
};
|
||||
|
||||
// Collect core dump paths if any
|
||||
@@ -2320,7 +2677,7 @@ LIMIT 100",
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
@@ -2339,7 +2696,7 @@ LIMIT 100",
|
||||
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
|
||||
preload_libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.filter(|s| *s != "neon" && *s != "databricks_auth" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
@@ -2392,22 +2749,22 @@ LIMIT 100",
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_connstr = state
|
||||
let old_pageserver_conninfo = state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr
|
||||
.pageserver_conninfo
|
||||
.clone();
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
.wait_timeout_while(state, duration, |s| {
|
||||
let pageserver_connstr = &s
|
||||
let pageserver_conninfo = &s
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr;
|
||||
unchanged = pageserver_connstr == &old_pageserver_connstr;
|
||||
.pageserver_conninfo;
|
||||
unchanged = pageserver_conninfo == &old_pageserver_conninfo;
|
||||
unchanged
|
||||
})
|
||||
.unwrap();
|
||||
@@ -2513,6 +2870,34 @@ LIMIT 100",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the compute spec and update related metrics.
|
||||
/// This is the central place where pspec is updated.
|
||||
pub fn set_spec(params: &ComputeNodeParams, state: &mut ComputeState, pspec: ParsedSpec) {
|
||||
state.pspec = Some(pspec);
|
||||
ComputeNode::update_attached_metric(params, state);
|
||||
let _ = logger::update_ids(¶ms.instance_id, &Some(params.compute_id.clone()));
|
||||
}
|
||||
|
||||
pub fn update_attached_metric(params: &ComputeNodeParams, state: &mut ComputeState) {
|
||||
// Update the pg_cctl_attached gauge when all identifiers are available.
|
||||
if let Some(instance_id) = ¶ms.instance_id {
|
||||
if let Some(pspec) = &state.pspec {
|
||||
// Clear all values in the metric
|
||||
COMPUTE_ATTACHED.reset();
|
||||
|
||||
// Set new metric value
|
||||
COMPUTE_ATTACHED
|
||||
.with_label_values(&[
|
||||
¶ms.compute_id,
|
||||
instance_id,
|
||||
&pspec.tenant_id.to_string(),
|
||||
&pspec.timeline_id.to_string(),
|
||||
])
|
||||
.set(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
|
||||
@@ -2637,7 +3022,10 @@ mod tests {
|
||||
|
||||
match ParsedSpec::try_from(spec.clone()) {
|
||||
Ok(_p) => panic!("Failed to detect duplicate entry"),
|
||||
Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
|
||||
Err(e) => assert!(
|
||||
e.to_string()
|
||||
.starts_with("duplicate entry in safekeeper_connstrings:")
|
||||
),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,6 +90,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
|
||||
/// Has a failpoint "compute-prewarm"
|
||||
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
||||
{
|
||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
||||
@@ -112,9 +113,8 @@ impl ComputeNode {
|
||||
Err(err) => {
|
||||
crate::metrics::LFC_PREWARM_ERRORS.inc();
|
||||
error!(%err, "could not prewarm LFC");
|
||||
|
||||
LfcPrewarmState::Failed {
|
||||
error: err.to_string(),
|
||||
error: format!("{err:#}"),
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -135,16 +135,20 @@ impl ComputeNode {
|
||||
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fail::fail_point!("compute-prewarm", |_| {
|
||||
bail!("prewarm configured to fail because of a failpoint")
|
||||
});
|
||||
|
||||
info!(%url, "requesting LFC state from endpoint storage");
|
||||
let request = Client::new().get(&url).bearer_auth(token);
|
||||
let res = request.send().await.context("querying endpoint storage")?;
|
||||
let status = res.status();
|
||||
match status {
|
||||
match res.status() {
|
||||
StatusCode::OK => (),
|
||||
StatusCode::NOT_FOUND => {
|
||||
return Ok(false);
|
||||
}
|
||||
_ => bail!("{status} querying endpoint storage"),
|
||||
status => bail!("{status} querying endpoint storage"),
|
||||
}
|
||||
|
||||
let mut uncompressed = Vec::new();
|
||||
@@ -205,7 +209,7 @@ impl ComputeNode {
|
||||
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
|
||||
error!(%err, "could not offload LFC state to endpoint storage");
|
||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
||||
error: err.to_string(),
|
||||
error: format!("{err:#}"),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -213,16 +217,22 @@ impl ComputeNode {
|
||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
|
||||
info!(%url, "requesting LFC state from Postgres");
|
||||
|
||||
let mut compressed = Vec::new();
|
||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
let row = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||
.await
|
||||
.context("connecting to postgres")?
|
||||
.query_one("select neon.get_local_cache_state()", &[])
|
||||
.await
|
||||
.context("querying LFC state")?
|
||||
.try_get::<usize, &[u8]>(0)
|
||||
.context("deserializing LFC state")
|
||||
.map(ZstdEncoder::new)?
|
||||
.context("querying LFC state")?;
|
||||
let state = row
|
||||
.try_get::<usize, Option<&[u8]>>(0)
|
||||
.context("deserializing LFC state")?;
|
||||
let Some(state) = state else {
|
||||
info!(%url, "empty LFC state, not exporting");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let mut compressed = Vec::new();
|
||||
ZstdEncoder::new(state)
|
||||
.read_to_end(&mut compressed)
|
||||
.await
|
||||
.context("compressing LFC state")?;
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::{Context, Result, bail};
|
||||
use compute_api::{
|
||||
responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
|
||||
spec::ComputeMode,
|
||||
};
|
||||
use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use itertools::Itertools;
|
||||
use std::collections::HashMap;
|
||||
use std::{sync::Arc, time::Duration};
|
||||
use tokio::time::sleep;
|
||||
use tracing::info;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl ComputeNode {
|
||||
@@ -13,21 +14,22 @@ impl ComputeNode {
|
||||
/// and http client disconnects, this does not stop promotion, and subsequent
|
||||
/// calls block until promote finishes.
|
||||
/// Called by control plane on secondary after primary endpoint is terminated
|
||||
pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
|
||||
/// Has a failpoint "compute-promotion"
|
||||
pub async fn promote(self: &Arc<Self>, cfg: PromoteConfig) -> PromoteState {
|
||||
let cloned = self.clone();
|
||||
let promote_fn = async move || {
|
||||
let Err(err) = cloned.promote_impl(cfg).await else {
|
||||
return PromoteState::Completed;
|
||||
};
|
||||
tracing::error!(%err, "promoting");
|
||||
PromoteState::Failed {
|
||||
error: format!("{err:#}"),
|
||||
}
|
||||
};
|
||||
|
||||
let start_promotion = || {
|
||||
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
|
||||
tokio::spawn(async move {
|
||||
tx.send(match cloned.promote_impl(safekeepers_lsn).await {
|
||||
Ok(_) => PromoteState::Completed,
|
||||
Err(err) => {
|
||||
tracing::error!(%err, "promoting");
|
||||
PromoteState::Failed {
|
||||
error: err.to_string(),
|
||||
}
|
||||
}
|
||||
})
|
||||
});
|
||||
tokio::spawn(async move { tx.send(promote_fn().await) });
|
||||
rx
|
||||
};
|
||||
|
||||
@@ -47,9 +49,7 @@ impl ComputeNode {
|
||||
task.borrow().clone()
|
||||
}
|
||||
|
||||
// Why do we have to supply safekeepers?
|
||||
// For secondary we use primary_connection_conninfo so safekeepers field is empty
|
||||
async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
|
||||
async fn promote_impl(&self, mut cfg: PromoteConfig) -> Result<()> {
|
||||
{
|
||||
let state = self.state.lock().unwrap();
|
||||
let mode = &state.pspec.as_ref().unwrap().spec.mode;
|
||||
@@ -73,7 +73,7 @@ impl ComputeNode {
|
||||
.await
|
||||
.context("connecting to postgres")?;
|
||||
|
||||
let primary_lsn = safekeepers_lsn.wal_flush_lsn;
|
||||
let primary_lsn = cfg.wal_flush_lsn;
|
||||
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
|
||||
const RETRIES: i32 = 20;
|
||||
for i in 0..=RETRIES {
|
||||
@@ -86,7 +86,7 @@ impl ComputeNode {
|
||||
if last_wal_replay_lsn >= primary_lsn {
|
||||
break;
|
||||
}
|
||||
tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
|
||||
info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
if last_wal_replay_lsn < primary_lsn {
|
||||
@@ -96,7 +96,7 @@ impl ComputeNode {
|
||||
// using $1 doesn't work with ALTER SYSTEM SET
|
||||
let safekeepers_sql = format!(
|
||||
"ALTER SYSTEM SET neon.safekeepers='{}'",
|
||||
safekeepers_lsn.safekeepers
|
||||
cfg.spec.safekeeper_connstrings.join(",")
|
||||
);
|
||||
client
|
||||
.query(&safekeepers_sql, &[])
|
||||
@@ -106,6 +106,12 @@ impl ComputeNode {
|
||||
.query("SELECT pg_reload_conf()", &[])
|
||||
.await
|
||||
.context("reloading postgres config")?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fail::fail_point!("compute-promotion", |_| {
|
||||
bail!("promotion configured to fail because of a failpoint")
|
||||
});
|
||||
|
||||
let row = client
|
||||
.query_one("SELECT * FROM pg_promote()", &[])
|
||||
.await
|
||||
@@ -125,8 +131,36 @@ impl ComputeNode {
|
||||
bail!("replica in read only mode after promotion");
|
||||
}
|
||||
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
|
||||
Ok(())
|
||||
{
|
||||
let mut state = self.state.lock().unwrap();
|
||||
let spec = &mut state.pspec.as_mut().unwrap().spec;
|
||||
spec.mode = ComputeMode::Primary;
|
||||
let new_conf = cfg.spec.cluster.postgresql_conf.as_mut().unwrap();
|
||||
let existing_conf = spec.cluster.postgresql_conf.as_ref().unwrap();
|
||||
Self::merge_spec(new_conf, existing_conf);
|
||||
}
|
||||
info!("applied new spec, reconfiguring as primary");
|
||||
self.reconfigure()
|
||||
}
|
||||
|
||||
/// Merge old and new Postgres conf specs to apply on secondary.
///
/// The new spec's port and safekeepers are changed since they are supplied
/// differently:
///
/// * every `neon.safekeepers` setting is dropped from `new_conf`;
/// * if `existing_conf` contains a `port` setting, every `port` setting in
///   `new_conf` is dropped and `port=<existing value>` is appended as the
///   last line. The last `port` entry of `existing_conf` is used. If
///   `existing_conf` has no `port`, `new_conf`'s own port lines are kept.
///
/// All other lines of `new_conf` (including blank lines, `#` comments and
/// lines without `=`) are kept verbatim and in their original order, so the
/// result is deterministic. Setting names are compared after trimming
/// surrounding whitespace, so `port = 5432` and `port=5432` are treated alike.
///
/// This function never panics on malformed input.
fn merge_spec(new_conf: &mut String, existing_conf: &str) {
    /// Split a `name=value` line into its trimmed name and value.
    /// Returns `None` for comments and for lines without `=`.
    fn parse_setting(line: &str) -> Option<(&str, &str)> {
        let line = line.trim();
        if line.starts_with('#') {
            return None;
        }
        let (name, value) = line.split_once('=')?;
        Some((name.trim_end(), value.trim_start()))
    }

    // Scan from the end so that the last `port` entry is the one we pick up.
    let existing_port = existing_conf
        .lines()
        .rev()
        .find_map(|line| parse_setting(line).filter(|(name, _)| *name == "port"))
        .map(|(_, value)| value);

    let mut merged: Vec<String> = new_conf
        .lines()
        .filter(|line| match parse_setting(line) {
            Some(("neon.safekeepers", _)) => false,
            // Only drop the new port when we have a replacement for it.
            Some(("port", _)) => existing_port.is_none(),
            _ => true,
        })
        .map(str::to_owned)
        .collect();

    if let Some(port) = existing_port {
        merged.push(format!("port={port}"));
    }

    *new_conf = merged.join("\n");
}
|
||||
}
|
||||
|
||||
@@ -7,13 +7,18 @@ use std::io::prelude::*;
|
||||
use std::path::Path;
|
||||
|
||||
use compute_api::responses::TlsConfig;
|
||||
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
|
||||
use compute_api::spec::{
|
||||
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
|
||||
};
|
||||
|
||||
use crate::compute::ComputeNodeParams;
|
||||
use crate::pg_helpers::{
|
||||
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
|
||||
DatabricksSettingsExt as _, GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize,
|
||||
escape_conf_value,
|
||||
};
|
||||
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
|
||||
use crate::tls::{SERVER_CRT, SERVER_KEY};
|
||||
|
||||
use utils::shard::{ShardIndex, ShardNumber};
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
@@ -40,12 +45,16 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn write_postgres_conf(
|
||||
pgdata_path: &Path,
|
||||
params: &ComputeNodeParams,
|
||||
spec: &ComputeSpec,
|
||||
postgres_port: Option<u16>,
|
||||
extension_server_port: u16,
|
||||
tls_config: &Option<TlsConfig>,
|
||||
databricks_settings: Option<&DatabricksSettings>,
|
||||
lakebase_mode: bool,
|
||||
) -> Result<()> {
|
||||
let path = pgdata_path.join("postgresql.conf");
|
||||
// File::create() destroys the file content if it exists.
|
||||
@@ -62,9 +71,75 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
// Add options for connecting to storage
|
||||
writeln!(file, "# Neon storage settings")?;
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
writeln!(file)?;
|
||||
if let Some(conninfo) = &spec.pageserver_connection_info {
|
||||
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
|
||||
let num_shards = if conninfo.shard_count.0 == 0 {
|
||||
1 // unsharded, treat it as a single shard
|
||||
} else {
|
||||
conninfo.shard_count.0
|
||||
};
|
||||
|
||||
for shard_number in 0..num_shards {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number: ShardNumber(shard_number),
|
||||
shard_count: conninfo.shard_count,
|
||||
};
|
||||
let info = conninfo.shards.get(&shard_index).ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"shard {shard_index} missing from pageserver_connection_info shard map"
|
||||
)
|
||||
})?;
|
||||
|
||||
let first_pageserver = info
|
||||
.pageservers
|
||||
.first()
|
||||
.expect("must have at least one pageserver");
|
||||
|
||||
// Add the libpq URL to the array, or if the URL is missing, reset the array
|
||||
// forgetting any previous entries. All servers must have a libpq URL, or none
|
||||
// at all.
|
||||
if let Some(url) = &first_pageserver.libpq_url {
|
||||
if let Some(ref mut urls) = libpq_urls {
|
||||
urls.push(url.clone());
|
||||
}
|
||||
} else {
|
||||
libpq_urls = None
|
||||
}
|
||||
}
|
||||
if let Some(libpq_urls) = libpq_urls {
|
||||
writeln!(
|
||||
file,
|
||||
"# derived from compute spec's pageserver_conninfo field"
|
||||
)?;
|
||||
writeln!(
|
||||
file,
|
||||
"neon.pageserver_connstring={}",
|
||||
escape_conf_value(&libpq_urls.join(","))
|
||||
)?;
|
||||
} else {
|
||||
writeln!(file, "# no neon.pageserver_connstring")?;
|
||||
}
|
||||
|
||||
if let Some(stripe_size) = conninfo.stripe_size {
|
||||
writeln!(
|
||||
file,
|
||||
"# from compute spec's pageserver_conninfo.stripe_size field"
|
||||
)?;
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
} else {
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "# from compute spec's pageserver_connstring field")?;
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "# from compute spec's shard_stripe_size field")?;
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
}
|
||||
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
let mut neon_safekeepers_value = String::new();
|
||||
tracing::info!(
|
||||
@@ -103,14 +178,9 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
|
||||
// tls
|
||||
if let Some(tls_config) = tls_config {
|
||||
if tls_config.is_some() {
|
||||
writeln!(file, "ssl = on")?;
|
||||
|
||||
// postgres requires the keyfile to be in a secure file,
|
||||
// currently too complicated to ensure that at the VM level,
|
||||
// so we just copy them to another file instead. :shrug:
|
||||
tls::update_key_path_blocking(pgdata_path, tls_config);
|
||||
|
||||
// these are the default, but good to be explicit.
|
||||
writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?;
|
||||
writeln!(file, "ssl_key_file = '{SERVER_KEY}'")?;
|
||||
@@ -285,6 +355,24 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "log_destination='stderr,syslog'")?;
|
||||
}
|
||||
|
||||
if lakebase_mode {
|
||||
// Explicitly set the port based on the connstr, overriding any previous port setting.
|
||||
// Note: It is important that we don't specify a different port again after this.
|
||||
let port = postgres_port.expect("port must be present in connstr");
|
||||
writeln!(file, "port = {port}")?;
|
||||
|
||||
// This is databricks specific settings.
|
||||
// This should be at the end of the file but before `compute_ctl_temp_override.conf` below
|
||||
// so that it can override any settings above.
|
||||
// `compute_ctl_temp_override.conf` is intended to override any settings above during specific operations.
|
||||
// To prevent potential breakage in the future, we keep it above `compute_ctl_temp_override.conf`.
|
||||
writeln!(file, "# Databricks settings start")?;
|
||||
if let Some(settings) = databricks_settings {
|
||||
writeln!(file, "{}", settings.as_pg_settings())?;
|
||||
}
|
||||
writeln!(file, "# Databricks settings end")?;
|
||||
}
|
||||
|
||||
// This is essential to keep this line at the end of the file,
|
||||
// because it is intended to override any settings above.
|
||||
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
|
||||
|
||||
@@ -1,23 +1,40 @@
|
||||
use std::sync::Arc;
|
||||
use std::fs::File;
|
||||
use std::thread;
|
||||
use std::{path::Path, sync::Arc};
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use anyhow::Result;
|
||||
use compute_api::responses::{ComputeConfig, ComputeStatus};
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::compute::{ComputeNode, ParsedSpec};
|
||||
use crate::spec::get_config_from_control_plane;
|
||||
|
||||
#[instrument(skip_all)]
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
info!("waiting for reconfiguration requests");
|
||||
loop {
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
/* BEGIN_HADRON */
|
||||
// RefreshConfiguration should only be used inside the loop
|
||||
assert_ne!(state.status, ComputeStatus::RefreshConfiguration);
|
||||
/* END_HADRON */
|
||||
|
||||
// We have to re-check the status after re-acquiring the lock because it could be that
|
||||
// the status has changed while we were waiting for the lock, and we might not need to
|
||||
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
|
||||
// we are waiting for a condition variable that will never be signaled.
|
||||
if state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
if compute.params.lakebase_mode {
|
||||
while state.status != ComputeStatus::ConfigurationPending
|
||||
&& state.status != ComputeStatus::RefreshConfigurationPending
|
||||
&& state.status != ComputeStatus::Failed
|
||||
{
|
||||
info!("configurator: compute status: {:?}, sleeping", state.status);
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
}
|
||||
} else {
|
||||
// We have to re-check the status after re-acquiring the lock because it could be that
|
||||
// the status has changed while we were waiting for the lock, and we might not need to
|
||||
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
|
||||
// we are waiting for a condition variable that will never be signaled.
|
||||
if state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// Re-check the status after waking up
|
||||
@@ -37,6 +54,136 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
// XXX: used to test that API is blocking
|
||||
// std::thread::sleep(std::time::Duration::from_millis(10000));
|
||||
|
||||
compute.set_status(new_status);
|
||||
} else if state.status == ComputeStatus::RefreshConfigurationPending {
|
||||
info!(
|
||||
"compute node suspects its configuration is out of date, now refreshing configuration"
|
||||
);
|
||||
state.set_status(ComputeStatus::RefreshConfiguration, &compute.state_changed);
|
||||
// Drop the lock guard here to avoid holding the lock while downloading config from the control plane / HCC.
|
||||
// This is the only thread that can move compute_ctl out of the `RefreshConfiguration` state, so it
|
||||
// is safe to drop the lock like this.
|
||||
drop(state);
|
||||
|
||||
let get_config_result: anyhow::Result<ComputeConfig> =
|
||||
if let Some(config_path) = &compute.params.config_path_test_only {
|
||||
// This path is only to make testing easier. In production we always get the config from the HCC.
|
||||
info!(
|
||||
"reloading config.json from path: {}",
|
||||
config_path.to_string_lossy()
|
||||
);
|
||||
let path = Path::new(config_path);
|
||||
if let Ok(file) = File::open(path) {
|
||||
match serde_json::from_reader::<File, ComputeConfig>(file) {
|
||||
Ok(config) => Ok(config),
|
||||
Err(e) => {
|
||||
error!("could not parse config file: {}", e);
|
||||
Err(anyhow::anyhow!("could not parse config file: {}", e))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
error!(
|
||||
"could not open config file at path: {:?}",
|
||||
config_path.to_string_lossy()
|
||||
);
|
||||
Err(anyhow::anyhow!(
|
||||
"could not open config file at path: {}",
|
||||
config_path.to_string_lossy()
|
||||
))
|
||||
}
|
||||
} else if let Some(control_plane_uri) = &compute.params.control_plane_uri {
|
||||
get_config_from_control_plane(control_plane_uri, &compute.params.compute_id)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("config_path_test_only is not set"))
|
||||
};
|
||||
|
||||
// Parse any received ComputeSpec and transpose the result into a Result<Option<ParsedSpec>>.
|
||||
let parsed_spec_result: Result<Option<ParsedSpec>> =
|
||||
get_config_result.and_then(|config| {
|
||||
if let Some(spec) = config.spec {
|
||||
if let Ok(pspec) = ParsedSpec::try_from(spec) {
|
||||
Ok(Some(pspec))
|
||||
} else {
|
||||
Err(anyhow::anyhow!("could not parse spec"))
|
||||
}
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
});
|
||||
|
||||
let new_status: ComputeStatus;
|
||||
match parsed_spec_result {
|
||||
// Control plane (HCM) returned a spec and we were able to parse it.
|
||||
Ok(Some(pspec)) => {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
// Defensive programming to make sure this thread is indeed the only one that can move the compute
|
||||
// node out of the `RefreshConfiguration` state. Would be nice if we can encode this invariant
|
||||
// into the type system.
|
||||
assert_eq!(state.status, ComputeStatus::RefreshConfiguration);
|
||||
|
||||
if state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|ps| ps.pageserver_conninfo.clone())
|
||||
== Some(pspec.pageserver_conninfo.clone())
|
||||
{
|
||||
info!(
|
||||
"Refresh configuration: Retrieved spec is the same as the current spec. Waiting for control plane to update the spec before attempting reconfiguration."
|
||||
);
|
||||
state.status = ComputeStatus::Running;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
std::thread::sleep(std::time::Duration::from_secs(5));
|
||||
continue;
|
||||
}
|
||||
// state.pspec is consumed by compute.reconfigure() below. Note that compute.reconfigure() will acquire
|
||||
// the compute.state lock again so we need to have the lock guard go out of scope here. We could add a
|
||||
// "locked" variant of compute.reconfigure() that takes the lock guard as an argument to make this cleaner,
|
||||
// but it's not worth forking the codebase too much for this minor point alone right now.
|
||||
state.pspec = Some(pspec);
|
||||
}
|
||||
match compute.reconfigure() {
|
||||
Ok(_) => {
|
||||
info!("Refresh configuration: compute node configured");
|
||||
new_status = ComputeStatus::Running;
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Refresh configuration: could not configure compute node: {}",
|
||||
e
|
||||
);
|
||||
// Set the compute node back to the `RefreshConfigurationPending` state if the configuration
|
||||
// was not successful. It should be okay to treat this situation the same as if the loop
|
||||
// hasn't executed yet as long as the detection side keeps notifying.
|
||||
new_status = ComputeStatus::RefreshConfigurationPending;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Control plane (HCM)'s response does not contain a spec. This is the "Empty" attachment case.
|
||||
Ok(None) => {
|
||||
info!(
|
||||
"Compute Manager signaled that this compute is no longer attached to any storage. Exiting."
|
||||
);
|
||||
// We just immediately terminate the whole compute_ctl in this case. It's not necessary to attempt a
|
||||
// clean shutdown as Postgres is probably not responding anyway (which is why we are in this refresh
|
||||
// configuration state).
|
||||
std::process::exit(1);
|
||||
}
|
||||
// Various error cases:
|
||||
// - The request to the control plane (HCM) either failed or returned a malformed spec.
|
||||
// - compute_ctl itself is configured incorrectly (e.g., compute_id is not set).
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Refresh configuration: error getting a parsed spec: {:?}",
|
||||
e
|
||||
);
|
||||
new_status = ComputeStatus::RefreshConfigurationPending;
|
||||
// We may be dealing with an overloaded HCM if we end up in this path. Backoff 5 seconds before
|
||||
// retrying to avoid hammering the HCM.
|
||||
std::thread::sleep(std::time::Duration::from_secs(5));
|
||||
}
|
||||
}
|
||||
compute.set_status(new_status);
|
||||
} else if state.status == ComputeStatus::Failed {
|
||||
info!("compute node is now in Failed state, exiting");
|
||||
|
||||
@@ -16,13 +16,29 @@ use crate::http::JsonResponse;
|
||||
/// Request authorizer for the compute_ctl HTTP server; it implements `AsyncAuthorizeRequest`
/// and is installed through `AsyncRequireAuthorizationLayer`.
#[derive(Clone, Debug)]
pub(in crate::http) struct Authorize {
    // ID of this compute, handed to `new` by the router.
    // NOTE(review): presumably compared against a claim in the bearer token — confirm in `authorize`.
    compute_id: String,
    // BEGIN HADRON
    // Hadron instance ID. Only set if it's a Lakebase V1 a.k.a. Hadron instance.
    // When set, `authorize` lets every request through without checking a token.
    instance_id: Option<String>,
    // END HADRON
    // Key set handed to `new`; `new` switches validation to RS256 if any key declares that
    // algorithm, otherwise EdDSA is used.
    jwks: JwkSet,
    // Token validation settings assembled in `new` (expiry is checked, `nbf` is not).
    validation: Validation,
}
|
||||
|
||||
impl Authorize {
|
||||
pub fn new(compute_id: String, jwks: JwkSet) -> Self {
|
||||
pub fn new(compute_id: String, instance_id: Option<String>, jwks: JwkSet) -> Self {
|
||||
let mut validation = Validation::new(Algorithm::EdDSA);
|
||||
|
||||
// BEGIN HADRON
|
||||
let use_rsa = jwks.keys.iter().any(|jwk| {
|
||||
jwk.common
|
||||
.key_algorithm
|
||||
.is_some_and(|alg| alg == jsonwebtoken::jwk::KeyAlgorithm::RS256)
|
||||
});
|
||||
if use_rsa {
|
||||
validation = Validation::new(Algorithm::RS256);
|
||||
}
|
||||
// END HADRON
|
||||
|
||||
validation.validate_exp = true;
|
||||
// Unused by the control plane
|
||||
validation.validate_nbf = false;
|
||||
@@ -34,6 +50,7 @@ impl Authorize {
|
||||
|
||||
Self {
|
||||
compute_id,
|
||||
instance_id,
|
||||
jwks,
|
||||
validation,
|
||||
}
|
||||
@@ -47,10 +64,20 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
|
||||
fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
|
||||
let compute_id = self.compute_id.clone();
|
||||
let is_hadron_instance = self.instance_id.is_some();
|
||||
let jwks = self.jwks.clone();
|
||||
let validation = self.validation.clone();
|
||||
|
||||
Box::pin(async move {
|
||||
// BEGIN HADRON
|
||||
// In Hadron deployments the "external" HTTP endpoint on compute_ctl can only be
|
||||
// accessed by trusted components (enforced by dblet network policy), so we can bypass
|
||||
// all auth here.
|
||||
if is_hadron_instance {
|
||||
return Ok(request);
|
||||
}
|
||||
// END HADRON
|
||||
|
||||
let TypedHeader(Authorization(bearer)) = request
|
||||
.extract_parts::<TypedHeader<Authorization<Bearer>>>()
|
||||
.await
|
||||
|
||||
@@ -96,7 +96,7 @@ paths:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SafekeepersLsn"
|
||||
$ref: "#/components/schemas/ComputeSchemaWithLsn"
|
||||
responses:
|
||||
200:
|
||||
description: Promote succeeded or wasn't started
|
||||
@@ -297,14 +297,7 @@ paths:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
properties:
|
||||
spec:
|
||||
# XXX: I don't want to explain current spec in the OpenAPI format,
|
||||
# as it could be changed really soon. Consider doing it later.
|
||||
type: object
|
||||
$ref: "#/components/schemas/ComputeSchema"
|
||||
responses:
|
||||
200:
|
||||
description: Compute configuration finished.
|
||||
@@ -591,18 +584,25 @@ components:
|
||||
type: string
|
||||
example: "1.0.0"
|
||||
|
||||
SafekeepersLsn:
|
||||
ComputeSchema:
|
||||
type: object
|
||||
required:
|
||||
- safekeepers
|
||||
- spec
|
||||
properties:
|
||||
spec:
|
||||
type: object
|
||||
ComputeSchemaWithLsn:
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
- wal_flush_lsn
|
||||
properties:
|
||||
safekeepers:
|
||||
description: Primary replica safekeepers
|
||||
type: string
|
||||
spec:
|
||||
$ref: "#/components/schemas/ComputeState"
|
||||
wal_flush_lsn:
|
||||
description: Primary last WAL flush LSN
|
||||
type: string
|
||||
description: "last WAL flush LSN"
|
||||
example: "0/028F10D8"
|
||||
|
||||
LfcPrewarmState:
|
||||
type: object
|
||||
|
||||
@@ -12,8 +12,10 @@ use crate::http::JsonResponse;
|
||||
/// Check that the compute is currently running.
|
||||
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
return JsonResponse::invalid_status(status);
|
||||
match status {
|
||||
// If we are running, or just reloading the config, we are ok to write a new config.
|
||||
ComputeStatus::Running | ComputeStatus::Reloading => {}
|
||||
_ => return JsonResponse::invalid_status(status),
|
||||
}
|
||||
|
||||
match check_writability(&compute).await {
|
||||
|
||||
@@ -27,27 +27,6 @@ pub(in crate::http) async fn configure(
|
||||
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
|
||||
};
|
||||
|
||||
// XXX: wrap state update under lock in a code block. Otherwise, we will try
|
||||
// to `Send` `mut state` into the spawned thread below, which will cause
|
||||
// the following rustc error:
|
||||
//
|
||||
// error: future cannot be sent between threads safely
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
|
||||
// Pass the tracing span to the main thread that performs the startup,
|
||||
// so that the start_compute operation is considered a child of this
|
||||
// configure request for tracing purposes.
|
||||
state.startup_span = Some(tracing::Span::current());
|
||||
|
||||
state.pspec = Some(pspec);
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Running. This is
|
||||
// needed to not block the main pool of workers and to be able to serve
|
||||
// other requests while some particular request is waiting for compute to
|
||||
@@ -55,6 +34,32 @@ pub(in crate::http) async fn configure(
|
||||
let c = compute.clone();
|
||||
let completed = task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
loop {
|
||||
match state.status {
|
||||
// ideal state.
|
||||
ComputeStatus::Empty | ComputeStatus::Running => break,
|
||||
// we need to wait until reloaded
|
||||
ComputeStatus::Reloading => {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
}
|
||||
// All other cases are unexpected.
|
||||
_ => return Err(JsonResponse::invalid_status(state.status)),
|
||||
}
|
||||
}
|
||||
|
||||
// Pass the tracing span to the main thread that performs the startup,
|
||||
// so that the start_compute operation is considered a child of this
|
||||
// configure request for tracing purposes.
|
||||
state.startup_span = Some(tracing::Span::current());
|
||||
|
||||
if c.params.lakebase_mode {
|
||||
ComputeNode::set_spec(&c.params, &mut state, pspec);
|
||||
} else {
|
||||
state.pspec = Some(pspec);
|
||||
}
|
||||
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &c.state_changed);
|
||||
|
||||
while state.status != ComputeStatus::Running {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
@@ -66,7 +71,7 @@ pub(in crate::http) async fn configure(
|
||||
if state.status == ComputeStatus::Failed {
|
||||
let err = state.error.as_ref().map_or("unknown error", |x| x);
|
||||
let msg = format!("compute configuration failed: {err:?}");
|
||||
return Err(msg);
|
||||
return Err(JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, msg));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,7 +81,7 @@ pub(in crate::http) async fn configure(
|
||||
.unwrap();
|
||||
|
||||
if let Err(e) = completed {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
|
||||
return e;
|
||||
}
|
||||
|
||||
// Return current compute state if everything went well.
|
||||
|
||||
34
compute_tools/src/http/routes/hadron_liveness_probe.rs
Normal file
34
compute_tools/src/http/routes/hadron_liveness_probe.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
use crate::pg_isready::pg_isready;
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
use axum::{extract::State, http::StatusCode, response::Response};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// NOTE: NOT ENABLED YET
|
||||
/// Detect if the compute is alive.
|
||||
/// Called by the liveness probe of the compute container.
|
||||
pub(in crate::http) async fn hadron_liveness_probe(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
let port = match compute.params.connstr.port() {
|
||||
Some(port) => port,
|
||||
None => {
|
||||
return JsonResponse::error(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
"Failed to get the port from the connection string",
|
||||
);
|
||||
}
|
||||
};
|
||||
match pg_isready(&compute.params.pg_isready_bin, port) {
|
||||
Ok(_) => {
|
||||
// The connection is successful, so the compute is alive.
|
||||
// Return a 200 OK response.
|
||||
JsonResponse::success(StatusCode::OK, "ok")
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Hadron liveness probe failed: {}", e);
|
||||
// The connection failed, so the compute is not alive.
|
||||
// Return a 500 Internal Server Error response.
|
||||
JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,7 @@ use metrics::{Encoder, TextEncoder};
|
||||
|
||||
use crate::communicator_socket_client::connect_communicator_socket;
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::hadron_metrics;
|
||||
use crate::http::JsonResponse;
|
||||
use crate::metrics::collect;
|
||||
|
||||
@@ -21,11 +22,18 @@ pub(in crate::http) async fn get_metrics() -> Response {
|
||||
// When we call TextEncoder::encode() below, it will immediately return an
|
||||
// error if a metric family has no metrics, so we need to preemptively
|
||||
// filter out metric families with no metrics.
|
||||
let metrics = collect()
|
||||
let mut metrics = collect()
|
||||
.into_iter()
|
||||
.filter(|m| !m.get_metric().is_empty())
|
||||
.collect::<Vec<MetricFamily>>();
|
||||
|
||||
// Add Hadron metrics.
|
||||
let hadron_metrics: Vec<MetricFamily> = hadron_metrics::collect()
|
||||
.into_iter()
|
||||
.filter(|m| !m.get_metric().is_empty())
|
||||
.collect();
|
||||
metrics.extend(hadron_metrics);
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
let mut buffer = vec![];
|
||||
|
||||
|
||||
@@ -10,11 +10,13 @@ pub(in crate::http) mod extension_server;
|
||||
pub(in crate::http) mod extensions;
|
||||
pub(in crate::http) mod failpoints;
|
||||
pub(in crate::http) mod grants;
|
||||
pub(in crate::http) mod hadron_liveness_probe;
|
||||
pub(in crate::http) mod insights;
|
||||
pub(in crate::http) mod lfc;
|
||||
pub(in crate::http) mod metrics;
|
||||
pub(in crate::http) mod metrics_json;
|
||||
pub(in crate::http) mod promote;
|
||||
pub(in crate::http) mod refresh_configuration;
|
||||
pub(in crate::http) mod status;
|
||||
pub(in crate::http) mod terminate;
|
||||
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use crate::http::JsonResponse;
|
||||
use axum::Form;
|
||||
use axum::extract::Json;
|
||||
use http::StatusCode;
|
||||
|
||||
pub(in crate::http) async fn promote(
|
||||
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
|
||||
Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
|
||||
Json(cfg): Json<compute_api::responses::PromoteConfig>,
|
||||
) -> axum::response::Response {
|
||||
let state = compute.promote(safekeepers_lsn).await;
|
||||
if let compute_api::responses::PromoteState::Failed { error } = state {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
|
||||
let state = compute.promote(cfg).await;
|
||||
if let compute_api::responses::PromoteState::Failed { error: _ } = state {
|
||||
return JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state);
|
||||
}
|
||||
JsonResponse::success(StatusCode::OK, state)
|
||||
}
|
||||
|
||||
29
compute_tools/src/http/routes/refresh_configuration.rs
Normal file
29
compute_tools/src/http/routes/refresh_configuration.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
// This file is added by Hadron
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{
|
||||
extract::State,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::hadron_metrics::POSTGRES_PAGESTREAM_REQUEST_ERRORS;
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// The /refresh_configuration POST method is used to nudge compute_ctl to pull a new spec
|
||||
/// from the HCC and attempt to reconfigure Postgres with the new spec. The method does not wait
|
||||
/// for the reconfiguration to complete. Rather, it simply delivers a signal that will cause
|
||||
/// configuration to be reloaded in a best effort manner. Invocation of this method does not
|
||||
/// guarantee that a reconfiguration will occur. The caller should consider keep sending this
|
||||
/// request while it believes that the compute configuration is out of date.
|
||||
pub(in crate::http) async fn refresh_configuration(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
POSTGRES_PAGESTREAM_REQUEST_ERRORS.inc();
|
||||
match compute.signal_refresh_configuration().await {
|
||||
Ok(_) => StatusCode::OK.into_response(),
|
||||
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::compute::{ComputeNode, forward_termination_signal};
|
||||
use crate::http::JsonResponse;
|
||||
use axum::extract::State;
|
||||
use axum::response::Response;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum_extra::extract::OptionalQuery;
|
||||
use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
|
||||
use http::StatusCode;
|
||||
@@ -33,7 +33,29 @@ pub(in crate::http) async fn terminate(
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
|
||||
// If compute is Empty, there's no Postgres to terminate. The regular compute_ctl termination path
|
||||
// assumes Postgres to be configured and running, so we just special-handle this case by exiting
|
||||
// the process directly.
|
||||
if compute.params.lakebase_mode && state.status == ComputeStatus::Empty {
|
||||
drop(state);
|
||||
info!("terminating empty compute - will exit process");
|
||||
|
||||
// Queue a task to exit the process after 5 seconds. The 5-second delay aims to
|
||||
// give enough time for the HTTP response to be sent so that HCM doesn't get an abrupt
|
||||
// connection termination.
|
||||
tokio::spawn(async {
|
||||
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
||||
info!("exiting process after terminating empty compute");
|
||||
std::process::exit(0);
|
||||
});
|
||||
|
||||
return StatusCode::OK.into_response();
|
||||
}
|
||||
|
||||
// For Running status, proceed with normal termination
|
||||
state.set_status(mode.into(), &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
forward_termination_signal(false);
|
||||
|
||||
@@ -23,7 +23,8 @@ use super::{
|
||||
middleware::authorize::Authorize,
|
||||
routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
|
||||
grants, hadron_liveness_probe, insights, lfc, metrics, metrics_json, promote,
|
||||
refresh_configuration, status, terminate,
|
||||
},
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
@@ -43,6 +44,7 @@ pub enum Server {
|
||||
port: u16,
|
||||
config: ComputeCtlConfig,
|
||||
compute_id: String,
|
||||
instance_id: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -67,7 +69,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
post(extension_server::download_extension),
|
||||
)
|
||||
.route("/extensions", post(extensions::install_extension))
|
||||
.route("/grants", post(grants::add_grant));
|
||||
.route("/grants", post(grants::add_grant))
|
||||
// Hadron: Compute-initiated configuration refresh
|
||||
.route(
|
||||
"/refresh_configuration",
|
||||
post(refresh_configuration::refresh_configuration),
|
||||
);
|
||||
|
||||
// Add in any testing support
|
||||
if cfg!(feature = "testing") {
|
||||
@@ -79,7 +86,10 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
router
|
||||
}
|
||||
Server::External {
|
||||
config, compute_id, ..
|
||||
config,
|
||||
compute_id,
|
||||
instance_id,
|
||||
..
|
||||
} => {
|
||||
let unauthenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||
.route("/metrics", get(metrics::get_metrics))
|
||||
@@ -100,8 +110,13 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate))
|
||||
.route(
|
||||
"/hadron_liveness_probe",
|
||||
get(hadron_liveness_probe::hadron_liveness_probe),
|
||||
)
|
||||
.layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
|
||||
compute_id.clone(),
|
||||
instance_id.clone(),
|
||||
config.jwks.clone(),
|
||||
)));
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use anyhow::Result;
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio_postgres::error::Error as PostgresError;
|
||||
use tokio_postgres::{Client, Config, NoTls};
|
||||
|
||||
@@ -119,3 +120,7 @@ pub async fn get_installed_extensions(
|
||||
extensions: extensions_map.into_values().collect(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn initialize_metrics() {
|
||||
Lazy::force(&INSTALLED_EXTENSIONS);
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod pg_isready;
|
||||
pub mod pgbouncer;
|
||||
pub mod rsyslog;
|
||||
pub mod spec;
|
||||
|
||||
@@ -11,9 +11,11 @@ use utils::pid_file::{self, PidFileRead};
|
||||
|
||||
pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> {
|
||||
write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?;
|
||||
notify_local_proxy("/etc/local_proxy/pid".as_ref())?;
|
||||
reload()
|
||||
}
|
||||
|
||||
Ok(())
|
||||
pub fn reload() -> Result<()> {
|
||||
notify_local_proxy("/etc/local_proxy/pid".as_ref())
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{LazyLock, RwLock};
|
||||
use tracing::Subscriber;
|
||||
use tracing::info;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_appender;
|
||||
use tracing_subscriber::prelude::*;
|
||||
use tracing_subscriber::{fmt, layer::SubscriberExt, registry::LookupSpan};
|
||||
|
||||
/// Initialize logging to stderr, and OpenTelemetry tracing and exporter.
|
||||
///
|
||||
@@ -15,16 +18,44 @@ use tracing_subscriber::prelude::*;
|
||||
///
|
||||
pub fn init_tracing_and_logging(
|
||||
default_log_level: &str,
|
||||
) -> anyhow::Result<Option<tracing_utils::Provider>> {
|
||||
log_dir_opt: &Option<String>,
|
||||
) -> anyhow::Result<(
|
||||
Option<tracing_utils::Provider>,
|
||||
Option<tracing_appender::non_blocking::WorkerGuard>,
|
||||
)> {
|
||||
// Initialize Logging
|
||||
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
|
||||
|
||||
// Standard output streams
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_ansi(false)
|
||||
.with_target(false)
|
||||
.with_writer(std::io::stderr);
|
||||
|
||||
// Logs with file rotation. Files in `$log_dir/pgcctl.yyyy-MM-dd`
|
||||
let (json_to_file_layer, _file_logs_guard) = if let Some(log_dir) = log_dir_opt {
|
||||
std::fs::create_dir_all(log_dir)?;
|
||||
let file_logs_appender = tracing_appender::rolling::RollingFileAppender::builder()
|
||||
.rotation(tracing_appender::rolling::Rotation::DAILY)
|
||||
.filename_prefix("pgcctl")
|
||||
// Lib appends to existing files, so we will keep files for up to 2 days even on restart loops.
|
||||
// At minimum, log-daemon will have 1 day to detect and upload a file (if created right before midnight).
|
||||
.max_log_files(2)
|
||||
.build(log_dir)
|
||||
.expect("Initializing rolling file appender should succeed");
|
||||
let (file_logs_writer, _file_logs_guard) =
|
||||
tracing_appender::non_blocking(file_logs_appender);
|
||||
let json_to_file_layer = tracing_subscriber::fmt::layer()
|
||||
.with_ansi(false)
|
||||
.with_target(false)
|
||||
.event_format(PgJsonLogShapeFormatter)
|
||||
.with_writer(file_logs_writer);
|
||||
(Some(json_to_file_layer), Some(_file_logs_guard))
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
// Initialize OpenTelemetry
|
||||
let provider =
|
||||
tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default());
|
||||
@@ -35,12 +66,13 @@ pub fn init_tracing_and_logging(
|
||||
.with(env_filter)
|
||||
.with(otlp_layer)
|
||||
.with(fmt_layer)
|
||||
.with(json_to_file_layer)
|
||||
.init();
|
||||
tracing::info!("logging and tracing started");
|
||||
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
Ok(provider)
|
||||
Ok((provider, _file_logs_guard))
|
||||
}
|
||||
|
||||
/// Replace all newline characters with a special character to make it
|
||||
@@ -95,3 +127,157 @@ pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Track relevant id's
|
||||
const UNKNOWN_IDS: &str = r#""pg_instance_id": "", "pg_compute_id": """#;
|
||||
static IDS: LazyLock<RwLock<String>> = LazyLock::new(|| RwLock::new(UNKNOWN_IDS.to_string()));
|
||||
|
||||
pub fn update_ids(instance_id: &Option<String>, compute_id: &Option<String>) -> anyhow::Result<()> {
|
||||
let ids = format!(
|
||||
r#""pg_instance_id": "{}", "pg_compute_id": "{}""#,
|
||||
instance_id.as_ref().map(|s| s.as_str()).unwrap_or_default(),
|
||||
compute_id.as_ref().map(|s| s.as_str()).unwrap_or_default()
|
||||
);
|
||||
let mut guard = IDS
|
||||
.write()
|
||||
.map_err(|e| anyhow::anyhow!("Log set id's rwlock poisoned: {}", e))?;
|
||||
*guard = ids;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Massage compute_ctl logs into PG json log shape so we can use the same Lumberjack setup.
|
||||
struct PgJsonLogShapeFormatter;
|
||||
impl<S, N> fmt::format::FormatEvent<S, N> for PgJsonLogShapeFormatter
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
N: for<'a> fmt::format::FormatFields<'a> + 'static,
|
||||
{
|
||||
fn format_event(
|
||||
&self,
|
||||
ctx: &fmt::FmtContext<'_, S, N>,
|
||||
mut writer: fmt::format::Writer<'_>,
|
||||
event: &tracing::Event<'_>,
|
||||
) -> std::fmt::Result {
|
||||
// Format values from the event's metadata, and open message string
|
||||
let metadata = event.metadata();
|
||||
{
|
||||
let ids_guard = IDS.read();
|
||||
let ids = ids_guard
|
||||
.as_ref()
|
||||
.map(|guard| guard.as_str())
|
||||
// Surpress so that we don't lose all uploaded/ file logs if something goes super wrong. We would notice the missing id's.
|
||||
.unwrap_or(UNKNOWN_IDS);
|
||||
write!(
|
||||
&mut writer,
|
||||
r#"{{"timestamp": "{}", "error_severity": "{}", "file_name": "{}", "backend_type": "compute_ctl_self", {}, "message": "#,
|
||||
chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f GMT"),
|
||||
metadata.level(),
|
||||
metadata.target(),
|
||||
ids
|
||||
)?;
|
||||
}
|
||||
|
||||
let mut message = String::new();
|
||||
let message_writer = fmt::format::Writer::new(&mut message);
|
||||
|
||||
// Gather the message
|
||||
ctx.field_format().format_fields(message_writer, event)?;
|
||||
|
||||
// TODO: any better options than to copy-paste this OSS span formatter?
|
||||
// impl<S, N, T> FormatEvent<S, N> for Format<Full, T>
|
||||
// https://docs.rs/tracing-subscriber/latest/tracing_subscriber/fmt/trait.FormatEvent.html#impl-FormatEvent%3CS,+N%3E-for-Format%3CFull,+T%3E
|
||||
|
||||
// write message, close bracket, and new line
|
||||
writeln!(writer, "{}}}", serde_json::to_string(&message).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use std::{cell::RefCell, io};
|
||||
|
||||
// Use thread_local! instead of Mutex for test isolation
|
||||
thread_local! {
|
||||
static WRITER_OUTPUT: RefCell<String> = const { RefCell::new(String::new()) };
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct StaticStringWriter;
|
||||
|
||||
impl io::Write for StaticStringWriter {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let output = String::from_utf8(buf.to_vec()).expect("Invalid UTF-8 in test output");
|
||||
WRITER_OUTPUT.with(|s| s.borrow_mut().push_str(&output));
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::MakeWriter<'_> for StaticStringWriter {
|
||||
type Writer = Self;
|
||||
|
||||
fn make_writer(&self) -> Self::Writer {
|
||||
Self
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_log_pg_json_shape_formatter() {
|
||||
// Use a scoped subscriber to prevent global state pollution
|
||||
let subscriber = tracing_subscriber::registry().with(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_ansi(false)
|
||||
.with_target(false)
|
||||
.event_format(PgJsonLogShapeFormatter)
|
||||
.with_writer(StaticStringWriter),
|
||||
);
|
||||
|
||||
let _ = update_ids(&Some("000".to_string()), &Some("111".to_string()));
|
||||
|
||||
// Clear any previous test state
|
||||
WRITER_OUTPUT.with(|s| s.borrow_mut().clear());
|
||||
|
||||
let messages = [
|
||||
"test message",
|
||||
r#"json escape check: name="BatchSpanProcessor.Flush.ExportError" reason="Other(reqwest::Error { kind: Request, url: \"http://localhost:4318/v1/traces\", source: hyper_
|
||||
util::client::legacy::Error(Connect, ConnectError(\"tcp connect error\", Os { code: 111, kind: ConnectionRefused, message: \"Connection refused\" })) })" Failed during the export process"#,
|
||||
];
|
||||
|
||||
tracing::subscriber::with_default(subscriber, || {
|
||||
for message in messages {
|
||||
tracing::info!(message);
|
||||
}
|
||||
});
|
||||
tracing::info!("not test message");
|
||||
|
||||
// Get captured output
|
||||
let output = WRITER_OUTPUT.with(|s| s.borrow().clone());
|
||||
|
||||
let json_strings: Vec<&str> = output.lines().collect();
|
||||
assert_eq!(
|
||||
json_strings.len(),
|
||||
messages.len(),
|
||||
"Log didn't have the expected number of json strings."
|
||||
);
|
||||
|
||||
let json_string_shape_regex = regex::Regex::new(
|
||||
r#"\{"timestamp": "\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3} GMT", "error_severity": "INFO", "file_name": ".+", "backend_type": "compute_ctl_self", "pg_instance_id": "000", "pg_compute_id": "111", "message": ".+"\}"#
|
||||
).unwrap();
|
||||
|
||||
for (i, expected_message) in messages.iter().enumerate() {
|
||||
let json_string = json_strings[i];
|
||||
assert!(
|
||||
json_string_shape_regex.is_match(json_string),
|
||||
"Json log didn't match expected pattern:\n{json_string}",
|
||||
);
|
||||
let parsed_json: serde_json::Value = serde_json::from_str(json_string).unwrap();
|
||||
let actual_message = parsed_json["message"].as_str().unwrap();
|
||||
assert_eq!(*expected_message, actual_message);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,14 +4,13 @@ use std::thread;
|
||||
use std::time::{Duration, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use itertools::Itertools as _;
|
||||
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
|
||||
use pageserver_page_api as page_api;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use tracing::{info, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let (connstrings, auth) = {
|
||||
let (conninfo, auth) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
(
|
||||
spec.pageserver_connstr.clone(),
|
||||
spec.pageserver_conninfo.clone(),
|
||||
spec.storage_auth_token.clone(),
|
||||
)
|
||||
};
|
||||
|
||||
let result =
|
||||
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
@@ -112,35 +110,44 @@ fn acquire_lsn_lease_with_retry(
|
||||
|
||||
/// Tries to acquire LSN leases on all Pageserver shards.
|
||||
fn try_acquire_lsn_lease(
|
||||
connstrings: &str,
|
||||
conninfo: PageserverConnectionInfo,
|
||||
auth: Option<&str>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let connstrings = connstrings.split(',').collect_vec();
|
||||
let shard_count = connstrings.len();
|
||||
let mut leases = Vec::new();
|
||||
|
||||
for (shard_number, &connstring) in connstrings.iter().enumerate() {
|
||||
let tenant_shard_id = match shard_count {
|
||||
0 | 1 => TenantShardId::unsharded(tenant_id),
|
||||
shard_count => TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
},
|
||||
for (shard_index, shard) in conninfo.shards.into_iter() {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: shard_index.shard_number,
|
||||
shard_count: shard_index.shard_count,
|
||||
};
|
||||
|
||||
let lease = match PageserverProtocol::from_connstring(connstring)? {
|
||||
PageserverProtocol::Libpq => {
|
||||
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
PageserverProtocol::Grpc => {
|
||||
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
|
||||
}
|
||||
};
|
||||
leases.push(lease);
|
||||
// XXX: If there is more than one pageserver for the one shard, do we need to get a
|
||||
// lease on all of them? Currently, that's what we assume, but this is hypothetical
|
||||
// as of this writing, as we never pass the info for more than one pageserver per
|
||||
// shard.
|
||||
for pageserver in shard.pageservers {
|
||||
let lease = match conninfo.prefer_protocol {
|
||||
PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
|
||||
&pageserver.grpc_url.unwrap(),
|
||||
auth,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?,
|
||||
PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
|
||||
&pageserver.libpq_url.unwrap(),
|
||||
auth,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?,
|
||||
};
|
||||
leases.push(lease);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(leases.into_iter().min().flatten())
|
||||
|
||||
@@ -466,13 +466,7 @@ fn update_pgbouncer_ini(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config using pgbouncer admin console
|
||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
||||
pub async fn tune_pgbouncer(
|
||||
mut pgbouncer_config: IndexMap<String, String>,
|
||||
tls_config: Option<TlsConfig>,
|
||||
) -> Result<()> {
|
||||
async fn connect() -> Result<tokio_postgres::Client> {
|
||||
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
|
||||
// for VMs use pgbouncer specific way to connect to
|
||||
// pgbouncer admin console without password
|
||||
@@ -518,18 +512,17 @@ pub async fn tune_pgbouncer(
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(tls_config) = tls_config {
|
||||
// pgbouncer starts in a half-ok state if it cannot find these files.
|
||||
// It will default to client_tls_sslmode=deny, which causes proxy to error.
|
||||
// There is a small window at startup where these files don't yet exist in the VM.
|
||||
// Best to wait until it exists.
|
||||
loop {
|
||||
if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await {
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(500)).await
|
||||
}
|
||||
Ok(client)
|
||||
}
|
||||
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config to pgbouncer.ini
|
||||
/// 2. Notify pgbouncer to reload
|
||||
pub async fn tune_pgbouncer(
|
||||
mut pgbouncer_config: IndexMap<String, String>,
|
||||
tls_config: Option<TlsConfig>,
|
||||
) -> Result<()> {
|
||||
if let Some(tls_config) = tls_config {
|
||||
pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path);
|
||||
pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path);
|
||||
pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string());
|
||||
@@ -550,10 +543,17 @@ pub async fn tune_pgbouncer(
|
||||
|
||||
info!("Applying pgbouncer setting change");
|
||||
|
||||
reload_pgbouncer().await
|
||||
}
|
||||
|
||||
/// Reload pgbouncer.
|
||||
pub async fn reload_pgbouncer() -> Result<()> {
|
||||
let client = connect().await?;
|
||||
|
||||
if let Err(err) = client.simple_query("RELOAD").await {
|
||||
// Don't fail on error, just print it into log
|
||||
error!("Failed to apply pgbouncer setting change, {err}",);
|
||||
};
|
||||
error!("Failed to apply pgbouncer setting change: {err}",);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
30
compute_tools/src/pg_isready.rs
Normal file
30
compute_tools/src/pg_isready.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
use anyhow::{Context, anyhow};
|
||||
|
||||
// Run `/usr/local/bin/pg_isready -p {port}`
|
||||
// Check the connectivity of PG
|
||||
// Success means PG is listening on the port and accepting connections
|
||||
// Note that PG does not need to authenticate the connection, nor reserve a connection quota for it.
|
||||
// See https://www.postgresql.org/docs/current/app-pg-isready.html
|
||||
pub fn pg_isready(bin: &str, port: u16) -> anyhow::Result<()> {
|
||||
let child_result = std::process::Command::new(bin)
|
||||
.arg("-p")
|
||||
.arg(port.to_string())
|
||||
.spawn();
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => Err(anyhow!("process exited with {status}")),
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| format!("could not run `{bin} --port {port}`"))
|
||||
}
|
||||
|
||||
// It's safe to assume pg_isready is in the same directory as postgres,
|
||||
// because it is a PG util bin installed along with postgres
|
||||
pub fn get_pg_isready_bin(pgbin: &str) -> String {
|
||||
let split = pgbin.split("/").collect::<Vec<&str>>();
|
||||
split[0..split.len() - 1].join("/") + "/pg_isready"
|
||||
}
|
||||
@@ -142,7 +142,7 @@ pub fn update_pg_hba(pgdata_path: &Path, databricks_pg_hba: Option<&String>) ->
|
||||
// Update pg_hba to contain databricks-specific settings before adding neon settings
|
||||
// PG uses the first record that matches to perform authentication, so we need to have
|
||||
// our rules before the default ones from neon.
|
||||
// See https://www.postgresql.org/docs/16/auth-pg-hba-conf.html
|
||||
// See https://www.postgresql.org/docs/current/auth-pg-hba-conf.html
|
||||
if let Some(databricks_pg_hba) = databricks_pg_hba {
|
||||
if config::line_in_file(
|
||||
&pghba_path,
|
||||
|
||||
@@ -13,17 +13,19 @@ use tokio_postgres::Client;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
|
||||
use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState, create_databricks_roles};
|
||||
use crate::hadron_metrics::COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS;
|
||||
use crate::pg_helpers::{
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
|
||||
get_existing_roles_async,
|
||||
};
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
|
||||
AddDatabricksGrants, AlterDatabricksRoles, CreateAndAlterDatabases, CreateAndAlterRoles,
|
||||
CreateAvailabilityCheck, CreateDatabricksMisc, CreateDatabricksRoles, CreatePgauditExtension,
|
||||
CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
|
||||
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
HandleDatabricksAuthExtension, HandleNeonExtension, HandleOtherExtensions,
|
||||
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
|
||||
};
|
||||
use crate::spec_apply::PerDatabasePhase::{
|
||||
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions,
|
||||
@@ -166,6 +168,7 @@ impl ComputeNode {
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
[DropLogicalSubscriptions].to_vec(),
|
||||
self.params.lakebase_mode,
|
||||
);
|
||||
|
||||
Ok(tokio::spawn(fut))
|
||||
@@ -186,15 +189,33 @@ impl ComputeNode {
|
||||
};
|
||||
}
|
||||
|
||||
for phase in [
|
||||
CreatePrivilegedRole,
|
||||
let phases = if self.params.lakebase_mode {
|
||||
vec![
|
||||
CreatePrivilegedRole,
|
||||
// BEGIN_HADRON
|
||||
CreateDatabricksRoles,
|
||||
AlterDatabricksRoles,
|
||||
// END_HADRON
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
] {
|
||||
]
|
||||
} else {
|
||||
vec![
|
||||
CreatePrivilegedRole,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
RenameAndDeleteDatabases,
|
||||
CreateAndAlterDatabases,
|
||||
CreateSchemaNeon,
|
||||
]
|
||||
};
|
||||
|
||||
for phase in phases {
|
||||
info!("Applying phase {:?}", &phase);
|
||||
apply_operations(
|
||||
params.clone(),
|
||||
@@ -203,6 +224,7 @@ impl ComputeNode {
|
||||
jwks_roles.clone(),
|
||||
phase,
|
||||
|| async { Ok(&client) },
|
||||
self.params.lakebase_mode,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -254,6 +276,7 @@ impl ComputeNode {
|
||||
concurrency_token.clone(),
|
||||
db,
|
||||
phases,
|
||||
self.params.lakebase_mode,
|
||||
);
|
||||
|
||||
Ok(tokio::spawn(fut))
|
||||
@@ -265,12 +288,28 @@ impl ComputeNode {
|
||||
handle.await??;
|
||||
}
|
||||
|
||||
let mut phases = vec![
|
||||
let mut phases = if self.params.lakebase_mode {
|
||||
vec![
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension, // This step depends on CreateSchemaNeon
|
||||
// BEGIN_HADRON
|
||||
HandleDatabricksAuthExtension,
|
||||
// END_HADRON
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
// BEGIN_HADRON
|
||||
AddDatabricksGrants,
|
||||
CreateDatabricksMisc,
|
||||
// END_HADRON
|
||||
]
|
||||
} else {
|
||||
vec![
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension, // This step depends on CreateSchemaNeon
|
||||
CreateAvailabilityCheck,
|
||||
DropRoles,
|
||||
];
|
||||
]
|
||||
};
|
||||
|
||||
// This step depends on CreateSchemaNeon
|
||||
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
|
||||
@@ -303,6 +342,7 @@ impl ComputeNode {
|
||||
jwks_roles.clone(),
|
||||
phase,
|
||||
|| async { Ok(&client) },
|
||||
self.params.lakebase_mode,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -328,6 +368,7 @@ impl ComputeNode {
|
||||
concurrency_token: Arc<tokio::sync::Semaphore>,
|
||||
db: DB,
|
||||
subphases: Vec<PerDatabasePhase>,
|
||||
lakebase_mode: bool,
|
||||
) -> Result<()> {
|
||||
let _permit = concurrency_token.acquire().await?;
|
||||
|
||||
@@ -355,6 +396,7 @@ impl ComputeNode {
|
||||
let client = client_conn.as_ref().unwrap();
|
||||
Ok(client)
|
||||
},
|
||||
lakebase_mode,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -477,6 +519,10 @@ pub enum PerDatabasePhase {
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ApplySpecPhase {
|
||||
CreatePrivilegedRole,
|
||||
// BEGIN_HADRON
|
||||
CreateDatabricksRoles,
|
||||
AlterDatabricksRoles,
|
||||
// END_HADRON
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -489,7 +535,14 @@ pub enum ApplySpecPhase {
|
||||
DisablePostgresDBPgAudit,
|
||||
HandleOtherExtensions,
|
||||
HandleNeonExtension,
|
||||
// BEGIN_HADRON
|
||||
HandleDatabricksAuthExtension,
|
||||
// END_HADRON
|
||||
CreateAvailabilityCheck,
|
||||
// BEGIN_HADRON
|
||||
AddDatabricksGrants,
|
||||
CreateDatabricksMisc,
|
||||
// END_HADRON
|
||||
DropRoles,
|
||||
FinalizeDropLogicalSubscriptions,
|
||||
}
|
||||
@@ -525,6 +578,7 @@ pub async fn apply_operations<'a, Fut, F>(
|
||||
jwks_roles: Arc<HashSet<String>>,
|
||||
apply_spec_phase: ApplySpecPhase,
|
||||
client: F,
|
||||
lakebase_mode: bool,
|
||||
) -> Result<()>
|
||||
where
|
||||
F: FnOnce() -> Fut,
|
||||
@@ -571,6 +625,23 @@ where
|
||||
},
|
||||
query
|
||||
);
|
||||
if !lakebase_mode {
|
||||
return res;
|
||||
}
|
||||
// BEGIN HADRON
|
||||
if let Err(e) = res.as_ref() {
|
||||
if let Some(sql_state) = e.code() {
|
||||
if sql_state.code() == "57014" {
|
||||
// SQL State 57014 (ERRCODE_QUERY_CANCELED) is used for statement timeouts.
|
||||
// Increment the counter whenever a statement timeout occurs. Timeouts on
|
||||
// this configuration path can only occur due to PS connectivity problems that
|
||||
// Postgres failed to recover from.
|
||||
COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
// END HADRON
|
||||
|
||||
res
|
||||
}
|
||||
.instrument(inspan)
|
||||
@@ -608,10 +679,44 @@ async fn get_operations<'a>(
|
||||
ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
|
||||
query: format!(
|
||||
include_str!("sql/create_privileged_role.sql"),
|
||||
privileged_role_name = params.privileged_role_name
|
||||
privileged_role_name = params.privileged_role_name,
|
||||
privileges = if params.lakebase_mode {
|
||||
"CREATEDB CREATEROLE NOLOGIN BYPASSRLS"
|
||||
} else {
|
||||
"CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS"
|
||||
}
|
||||
),
|
||||
comment: None,
|
||||
}))),
|
||||
// BEGIN_HADRON
|
||||
// New Hadron phase
|
||||
ApplySpecPhase::CreateDatabricksRoles => {
|
||||
let queries = create_databricks_roles();
|
||||
let operations = queries.into_iter().map(|query| Operation {
|
||||
query,
|
||||
comment: None,
|
||||
});
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
|
||||
// Backfill existing databricks_reader_* roles with statement timeout from GUC
|
||||
ApplySpecPhase::AlterDatabricksRoles => {
|
||||
let query = String::from(include_str!(
|
||||
"sql/alter_databricks_reader_roles_timeout.sql"
|
||||
));
|
||||
|
||||
let operations = once(Operation {
|
||||
query,
|
||||
comment: Some(
|
||||
"Backfill existing databricks_reader_* roles with statement timeout"
|
||||
.to_string(),
|
||||
),
|
||||
});
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// End of new Hadron Phase
|
||||
// END_HADRON
|
||||
ApplySpecPhase::DropInvalidDatabases => {
|
||||
let mut ctx = ctx.write().await;
|
||||
let databases = &mut ctx.dbs;
|
||||
@@ -981,7 +1086,10 @@ async fn get_operations<'a>(
|
||||
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
role_name = escaped_role,
|
||||
outer_tag = outer_tag,
|
||||
),
|
||||
)
|
||||
// HADRON change:
|
||||
.replace("neon_superuser", ¶ms.privileged_role_name),
|
||||
// HADRON change end ,
|
||||
comment: None,
|
||||
},
|
||||
// This now will only drop privileges of the role
|
||||
@@ -1017,7 +1125,8 @@ async fn get_operations<'a>(
|
||||
comment: None,
|
||||
},
|
||||
Operation {
|
||||
query: String::from(include_str!("sql/default_grants.sql")),
|
||||
query: String::from(include_str!("sql/default_grants.sql"))
|
||||
.replace("neon_superuser", ¶ms.privileged_role_name),
|
||||
comment: None,
|
||||
},
|
||||
]
|
||||
@@ -1086,6 +1195,28 @@ async fn get_operations<'a>(
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// BEGIN_HADRON
|
||||
// Note: we may want to version the extension someday, but for now we just drop it and recreate it.
|
||||
ApplySpecPhase::HandleDatabricksAuthExtension => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: String::from("DROP EXTENSION IF EXISTS databricks_auth"),
|
||||
comment: Some(String::from("dropping existing databricks_auth extension")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from("CREATE EXTENSION databricks_auth"),
|
||||
comment: Some(String::from("creating databricks_auth extension")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from("GRANT SELECT ON databricks_auth_metrics TO pg_monitor"),
|
||||
comment: Some(String::from("grant select on databricks auth counters")),
|
||||
},
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// END_HADRON
|
||||
ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")),
|
||||
comment: None,
|
||||
@@ -1103,6 +1234,63 @@ async fn get_operations<'a>(
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
|
||||
// BEGIN_HADRON
|
||||
// New Hadron phases
|
||||
//
|
||||
// Grants permissions to roles that are used by Databricks.
|
||||
ApplySpecPhase::AddDatabricksGrants => {
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: String::from("GRANT USAGE ON SCHEMA neon TO databricks_monitor"),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to execute neon.* functions (in the postgres database)",
|
||||
)),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT SELECT, INSERT, UPDATE ON health_check TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from("Permissions needed for read and write probes")),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT EXECUTE ON FUNCTION pg_ls_dir(text) TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to monitor .snap file counts",
|
||||
)),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT SELECT ON neon.neon_perf_counters TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to access neon performance counters view",
|
||||
)),
|
||||
},
|
||||
Operation {
|
||||
query: String::from(
|
||||
"GRANT EXECUTE ON FUNCTION neon.get_perf_counters() TO databricks_monitor",
|
||||
),
|
||||
comment: Some(String::from(
|
||||
"Permissions needed to execute the underlying performance counters function",
|
||||
)),
|
||||
},
|
||||
]
|
||||
.into_iter();
|
||||
|
||||
Ok(Box::new(operations))
|
||||
}
|
||||
// Creates minor objects that are used by Databricks.
|
||||
ApplySpecPhase::CreateDatabricksMisc => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/create_databricks_misc.sql")),
|
||||
comment: Some(String::from(
|
||||
"The function databricks_monitor uses to convert exception to 0 or 1",
|
||||
)),
|
||||
}))),
|
||||
// End of new Hadron phases
|
||||
// END_HADRON
|
||||
ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
|
||||
query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
|
||||
comment: None,
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
DO $$
|
||||
DECLARE
|
||||
reader_role RECORD;
|
||||
timeout_value TEXT;
|
||||
BEGIN
|
||||
-- Get the current GUC setting for reader statement timeout
|
||||
SELECT current_setting('databricks.reader_statement_timeout', true) INTO timeout_value;
|
||||
|
||||
-- Only proceed if timeout_value is not null/empty and not '0' (disabled)
|
||||
IF timeout_value IS NOT NULL AND timeout_value != '' AND timeout_value != '0' THEN
|
||||
-- Find all databricks_reader_* roles and update their statement_timeout
|
||||
FOR reader_role IN
|
||||
SELECT r.rolname
|
||||
FROM pg_roles r
|
||||
WHERE r.rolname ~ '^databricks_reader_\d+$'
|
||||
LOOP
|
||||
-- Apply the timeout setting to the role (will overwrite existing setting)
|
||||
EXECUTE format('ALTER ROLE %I SET statement_timeout = %L',
|
||||
reader_role.rolname, timeout_value);
|
||||
|
||||
RAISE LOG 'Updated statement_timeout = % for role %', timeout_value, reader_role.rolname;
|
||||
END LOOP;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
15
compute_tools/src/sql/create_databricks_misc.sql
Normal file
15
compute_tools/src/sql/create_databricks_misc.sql
Normal file
@@ -0,0 +1,15 @@
|
||||
ALTER ROLE databricks_monitor SET statement_timeout = '60s';
|
||||
|
||||
CREATE OR REPLACE FUNCTION health_check_write_succeeds()
|
||||
RETURNS INTEGER AS $$
|
||||
BEGIN
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();
|
||||
|
||||
RETURN 1;
|
||||
EXCEPTION WHEN OTHERS THEN
|
||||
RAISE EXCEPTION '[DATABRICKS_SMGR] health_check failed: [%] %', SQLSTATE, SQLERRM;
|
||||
RETURN 0;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
@@ -2,7 +2,7 @@ DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
|
||||
THEN
|
||||
CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
|
||||
@@ -3,42 +3,43 @@ use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration};
|
||||
use anyhow::{Context, Result, bail};
|
||||
use compute_api::responses::TlsConfig;
|
||||
use ring::digest;
|
||||
use x509_cert::Certificate;
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct CertDigest(digest::Digest);
|
||||
|
||||
pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver<CertDigest> {
|
||||
let mut digest = compute_digest(&cert_path).await;
|
||||
let (tx, rx) = tokio::sync::watch::channel(digest);
|
||||
tokio::spawn(async move {
|
||||
while !tx.is_closed() {
|
||||
let new_digest = compute_digest(&cert_path).await;
|
||||
if digest.0.as_ref() != new_digest.0.as_ref() {
|
||||
digest = new_digest;
|
||||
_ = tx.send(digest);
|
||||
}
|
||||
|
||||
tokio::time::sleep(Duration::from_secs(60)).await
|
||||
}
|
||||
});
|
||||
rx
|
||||
impl PartialEq for CertDigest {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.as_ref() == other.0.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
async fn compute_digest(cert_path: &str) -> CertDigest {
|
||||
pub fn wait_until_cert_changed(digest: CertDigest, cert_path: &str) -> CertDigest {
|
||||
loop {
|
||||
match try_compute_digest(cert_path).await {
|
||||
let new_digest = compute_digest(cert_path);
|
||||
if digest != new_digest {
|
||||
break new_digest;
|
||||
}
|
||||
|
||||
// Wait a while before checking the certificates.
|
||||
// We renew on a daily basis, so there's no rush.
|
||||
std::thread::sleep(Duration::from_secs(60));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compute_digest(cert_path: &str) -> CertDigest {
|
||||
loop {
|
||||
match try_compute_digest(cert_path) {
|
||||
Ok(d) => break d,
|
||||
Err(e) => {
|
||||
tracing::error!("could not read cert file {e:?}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await
|
||||
std::thread::sleep(Duration::from_secs(1))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
|
||||
let data = tokio::fs::read(cert_path).await?;
|
||||
fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
|
||||
let data = std::fs::read(cert_path)?;
|
||||
// sha256 is extremely collision resistant. We can safely assume the digest to be unique
|
||||
Ok(CertDigest(digest::digest(&digest::SHA256, &data)))
|
||||
}
|
||||
@@ -46,28 +47,37 @@ async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
|
||||
pub const SERVER_CRT: &str = "server.crt";
|
||||
pub const SERVER_KEY: &str = "server.key";
|
||||
|
||||
pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
|
||||
pub struct KeyPair {
|
||||
crt: String,
|
||||
key: String,
|
||||
}
|
||||
|
||||
pub fn load_certs_blocking(tls_config: &TlsConfig) -> KeyPair {
|
||||
loop {
|
||||
match try_update_key_path_blocking(pg_data, tls_config) {
|
||||
Ok(()) => break,
|
||||
match try_load_certs_blocking(tls_config) {
|
||||
Ok(key_pair) => break key_pair,
|
||||
Err(e) => {
|
||||
tracing::error!(error = ?e, "could not create key file");
|
||||
tracing::error!(error = ?e, "could not load certs");
|
||||
std::thread::sleep(Duration::from_secs(1))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Postgres requires the keypath be "secure". This means
|
||||
// 1. Owned by the postgres user.
|
||||
// 2. Have permission 600.
|
||||
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
|
||||
fn try_load_certs_blocking(tls_config: &TlsConfig) -> Result<KeyPair> {
|
||||
let key = std::fs::read_to_string(&tls_config.key_path)?;
|
||||
let crt = std::fs::read_to_string(&tls_config.cert_path)?;
|
||||
|
||||
// Verify that the key matches the cert, to mitigate a race condition during renewal.
|
||||
verify_key_cert(&key, &crt)?;
|
||||
|
||||
Ok(KeyPair { key, crt })
|
||||
}
|
||||
|
||||
// Postgres requires the keypath be "secure". This means
|
||||
// 1. Owned by the postgres user.
|
||||
// 2. Have permission 600.
|
||||
pub fn update_key_path_blocking(pg_data: &Path, key_pair: &KeyPair) -> Result<()> {
|
||||
let mut key_file = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
@@ -82,14 +92,22 @@ fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Resul
|
||||
.mode(0o600)
|
||||
.open(pg_data.join(SERVER_CRT))?;
|
||||
|
||||
key_file.write_all(key.as_bytes())?;
|
||||
crt_file.write_all(crt.as_bytes())?;
|
||||
// NOTE: We currently ensure that an explicit reload does not happen during TLS renewal, but
|
||||
// there's a chance that postgres/pgbouncer/local_proxy reloads implicitly halfway between
|
||||
// these writes. This could allow them to read a key that does not match the cert.
|
||||
// There doesn't seem to be any way to prevent that. However, we will issue a reload shortly
|
||||
// after which should at least correct it.
|
||||
key_file.write_all(key_pair.key.as_bytes())?;
|
||||
crt_file.write_all(key_pair.crt.as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
|
||||
use x509_cert::Certificate;
|
||||
use x509_cert::der::oid::db::rfc5912::ECDSA_WITH_SHA_256;
|
||||
use x509_cert::der::oid::db::rfc8410::ID_ED_25519;
|
||||
use x509_cert::der::pem;
|
||||
|
||||
let certs = Certificate::load_pem_chain(cert.as_bytes())
|
||||
.context("decoding PEM encoded certificates")?;
|
||||
@@ -100,22 +118,30 @@ fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
|
||||
bail!("no certificates found");
|
||||
};
|
||||
|
||||
let pubkey = cert
|
||||
.tbs_certificate
|
||||
.subject_public_key_info
|
||||
.subject_public_key
|
||||
.raw_bytes();
|
||||
|
||||
match cert.signature_algorithm.oid {
|
||||
ECDSA_WITH_SHA_256 => {
|
||||
let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?;
|
||||
|
||||
let a = key.public_key().to_sec1_bytes();
|
||||
let b = cert
|
||||
.tbs_certificate
|
||||
.subject_public_key_info
|
||||
.subject_public_key
|
||||
.raw_bytes();
|
||||
|
||||
if *a != *b {
|
||||
if *key.public_key().to_sec1_bytes() != *pubkey {
|
||||
bail!("private key file does not match certificate")
|
||||
}
|
||||
}
|
||||
_ => bail!("unknown TLS key type"),
|
||||
ID_ED_25519 => {
|
||||
use ring::signature::{Ed25519KeyPair, KeyPair};
|
||||
|
||||
let (_, bytes) = pem::decode_vec(key.as_bytes())
|
||||
.map_err(|_| anyhow::anyhow!("invalid key encoding"))?;
|
||||
let key = Ed25519KeyPair::from_pkcs8_maybe_unchecked(&bytes).context("parse key")?;
|
||||
if *key.public_key().as_ref() != *pubkey {
|
||||
bail!("private key file does not match certificate")
|
||||
}
|
||||
}
|
||||
oid => bail!("unknown TLS key type: {oid}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -19,6 +19,9 @@ use compute_api::requests::ComputeClaimsScope;
|
||||
use compute_api::spec::{ComputeMode, PageserverProtocol};
|
||||
use control_plane::broker::StorageBroker;
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
|
||||
use control_plane::endpoint::{
|
||||
local_pageserver_conf_to_conn_info, tenant_locate_response_to_conn_info,
|
||||
};
|
||||
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::{
|
||||
@@ -44,7 +47,6 @@ use pageserver_api::models::{
|
||||
};
|
||||
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::parse_host_port;
|
||||
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
@@ -52,7 +54,6 @@ use safekeeper_api::{
|
||||
};
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use tokio::task::JoinSet;
|
||||
use url::Host;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -560,7 +561,9 @@ enum EndpointCmd {
|
||||
Create(EndpointCreateCmdArgs),
|
||||
Start(EndpointStartCmdArgs),
|
||||
Reconfigure(EndpointReconfigureCmdArgs),
|
||||
RefreshConfiguration(EndpointRefreshConfigurationArgs),
|
||||
Stop(EndpointStopCmdArgs),
|
||||
UpdatePageservers(EndpointUpdatePageserversCmdArgs),
|
||||
GenerateJwt(EndpointGenerateJwtCmdArgs),
|
||||
}
|
||||
|
||||
@@ -721,6 +724,13 @@ struct EndpointReconfigureCmdArgs {
|
||||
safekeepers: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Refresh the endpoint's configuration by forcing it reload it's spec")]
|
||||
struct EndpointRefreshConfigurationArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
endpoint_id: String,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Stop an endpoint")]
|
||||
struct EndpointStopCmdArgs {
|
||||
@@ -738,6 +748,16 @@ struct EndpointStopCmdArgs {
|
||||
mode: EndpointTerminateMode,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Update the pageservers in the spec file of the compute endpoint")]
|
||||
struct EndpointUpdatePageserversCmdArgs {
|
||||
#[clap(help = "Postgres endpoint id")]
|
||||
endpoint_id: String,
|
||||
|
||||
#[clap(short = 'p', long, help = "Specified pageserver id")]
|
||||
pageserver_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(clap::Args)]
|
||||
#[clap(about = "Generate a JWT for an endpoint")]
|
||||
struct EndpointGenerateJwtCmdArgs {
|
||||
@@ -1069,7 +1089,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
|
||||
storage_controller: None,
|
||||
control_plane_hooks_api: None,
|
||||
generate_local_ssl_certs: false,
|
||||
generate_local_tls_certs: false,
|
||||
generate_compute_tls_certs: false,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1517,7 +1538,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
.ok_or_else(|| anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
|
||||
if !args.allow_multiple {
|
||||
cplane.check_conflicting_endpoints(
|
||||
@@ -1527,62 +1548,41 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
)?;
|
||||
}
|
||||
|
||||
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// fully managed by storage controller, therefore not sharded.
|
||||
(vec![pageserver], DEFAULT_STRIPE_SIZE)
|
||||
let prefer_protocol = if endpoint.grpc {
|
||||
PageserverProtocol::Grpc
|
||||
} else {
|
||||
PageserverProtocol::Libpq
|
||||
};
|
||||
|
||||
let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id).unwrap();
|
||||
local_pageserver_conf_to_conn_info(conf)?
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
let pageservers = futures::future::try_join_all(
|
||||
locate_result.shards.into_iter().map(|shard| async move {
|
||||
if let ComputeMode::Static(lsn) = endpoint.mode {
|
||||
// Initialize LSN leases for static computes.
|
||||
assert!(!locate_result.shards.is_empty());
|
||||
|
||||
// Initialize LSN leases for static computes.
|
||||
if let ComputeMode::Static(lsn) = endpoint.mode {
|
||||
futures::future::try_join_all(locate_result.shards.iter().map(
|
||||
|shard| async move {
|
||||
let conf = env.get_pageserver_conf(shard.node_id).unwrap();
|
||||
let pageserver = PageServerNode::from_env(env, conf);
|
||||
|
||||
pageserver
|
||||
.http_client
|
||||
.timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn)
|
||||
.await?;
|
||||
}
|
||||
.await
|
||||
},
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
|
||||
let pageserver = if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr)?,
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
};
|
||||
anyhow::Ok(pageserver)
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
let stripe_size = locate_result.shard_params.stripe_size;
|
||||
|
||||
(pageservers, stripe_size)
|
||||
tenant_locate_response_to_conn_info(&locate_result)?
|
||||
};
|
||||
assert!(!pageservers.is_empty());
|
||||
pageserver_conninfo.prefer_protocol = prefer_protocol;
|
||||
|
||||
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
|
||||
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
|
||||
@@ -1612,9 +1612,8 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
endpoint_storage_addr,
|
||||
safekeepers_generation,
|
||||
safekeepers,
|
||||
pageservers,
|
||||
pageserver_conninfo,
|
||||
remote_ext_base_url: remote_ext_base_url.clone(),
|
||||
shard_stripe_size: stripe_size.0 as usize,
|
||||
create_test_user: args.create_test_user,
|
||||
start_timeout: args.start_timeout,
|
||||
autoprewarm: args.autoprewarm,
|
||||
@@ -1625,59 +1624,76 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(args).await?;
|
||||
}
|
||||
EndpointCmd::UpdatePageservers(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let prefer_protocol = if endpoint.grpc {
|
||||
PageserverProtocol::Grpc
|
||||
} else {
|
||||
PageserverProtocol::Libpq
|
||||
};
|
||||
let mut pageserver_conninfo = match args.pageserver_id {
|
||||
Some(pageserver_id) => {
|
||||
let conf = env.get_pageserver_conf(pageserver_id)?;
|
||||
local_pageserver_conf_to_conn_info(conf)?
|
||||
}
|
||||
None => {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result =
|
||||
storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
|
||||
tenant_locate_response_to_conn_info(&locate_result)?
|
||||
}
|
||||
};
|
||||
pageserver_conninfo.prefer_protocol = prefer_protocol;
|
||||
|
||||
endpoint
|
||||
.update_pageservers_in_config(&pageserver_conninfo)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::Reconfigure(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
// Use gRPC if requested.
|
||||
let pageserver = if endpoint.grpc {
|
||||
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
(PageserverProtocol::Grpc, host, port)
|
||||
} else {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
(PageserverProtocol::Libpq, host, port)
|
||||
};
|
||||
vec![pageserver]
|
||||
|
||||
let prefer_protocol = if endpoint.grpc {
|
||||
PageserverProtocol::Grpc
|
||||
} else {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
// Use gRPC if requested.
|
||||
if endpoint.grpc {
|
||||
(
|
||||
PageserverProtocol::Grpc,
|
||||
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
|
||||
.expect("bad hostname"),
|
||||
shard.listen_grpc_port.expect("no gRPC port"),
|
||||
)
|
||||
} else {
|
||||
(
|
||||
PageserverProtocol::Libpq,
|
||||
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
PageserverProtocol::Libpq
|
||||
};
|
||||
let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id {
|
||||
let conf = env.get_pageserver_conf(ps_id)?;
|
||||
local_pageserver_conf_to_conn_info(conf)?
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
|
||||
tenant_locate_response_to_conn_info(&locate_result)?
|
||||
};
|
||||
pageserver_conninfo.prefer_protocol = prefer_protocol;
|
||||
|
||||
// If --safekeepers argument is given, use only the listed
|
||||
// safekeeper nodes; otherwise all from the env.
|
||||
let safekeepers = parse_safekeepers(&args.safekeepers)?;
|
||||
endpoint
|
||||
.reconfigure(Some(pageservers), None, safekeepers, None)
|
||||
.reconfigure(Some(&pageserver_conninfo), safekeepers, None)
|
||||
.await?;
|
||||
}
|
||||
EndpointCmd::RefreshConfiguration(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
endpoint.refresh_configuration().await?;
|
||||
}
|
||||
EndpointCmd::Stop(args) => {
|
||||
let endpoint_id = &args.endpoint_id;
|
||||
let endpoint = cplane
|
||||
|
||||
@@ -23,7 +23,7 @@ impl StorageBroker {
|
||||
}
|
||||
|
||||
pub fn initialize(&self) -> anyhow::Result<()> {
|
||||
if self.env.generate_local_ssl_certs {
|
||||
if self.env.generate_local_tls_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
&self.env.storage_broker_data_dir().join("server.crt"),
|
||||
&self.env.storage_broker_data_dir().join("server.key"),
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
//! <other PostgreSQL files>
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fmt::Display;
|
||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
@@ -54,12 +54,15 @@ use compute_api::requests::{
|
||||
};
|
||||
use compute_api::responses::{
|
||||
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse,
|
||||
TlsConfig,
|
||||
};
|
||||
use compute_api::spec::{
|
||||
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
|
||||
PgIdent, RemoteExtSpec, Role,
|
||||
PageserverShardInfo, PgIdent, RemoteExtSpec, Role,
|
||||
};
|
||||
|
||||
// re-export these, because they're used in the reconfigure() function
|
||||
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
|
||||
|
||||
use jsonwebtoken::jwk::{
|
||||
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
|
||||
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
|
||||
@@ -74,9 +77,11 @@ use sha2::{Digest, Sha256};
|
||||
use spki::der::Decode;
|
||||
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
|
||||
use tracing::debug;
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::shard::ShardStripeSize;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT;
|
||||
use postgres_connection::parse_host_port;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
@@ -207,8 +212,13 @@ impl ComputeControlPlane {
|
||||
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
|
||||
let compute_ctl_config = ComputeCtlConfig {
|
||||
jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
|
||||
tls: None::<TlsConfig>,
|
||||
tls: self.env.get_tls_config()?,
|
||||
};
|
||||
let mut features = vec![];
|
||||
if compute_ctl_config.tls.is_some() {
|
||||
features.push(ComputeFeature::TlsExperimental);
|
||||
}
|
||||
|
||||
let ep = Arc::new(Endpoint {
|
||||
endpoint_id: endpoint_id.to_owned(),
|
||||
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
|
||||
@@ -235,7 +245,7 @@ impl ComputeControlPlane {
|
||||
drop_subscriptions_before_start,
|
||||
grpc,
|
||||
reconfigure_concurrency: 1,
|
||||
features: vec![],
|
||||
features: features.clone(),
|
||||
cluster: None,
|
||||
compute_ctl_config: compute_ctl_config.clone(),
|
||||
privileged_role_name: privileged_role_name.clone(),
|
||||
@@ -257,7 +267,7 @@ impl ComputeControlPlane {
|
||||
skip_pg_catalog_updates,
|
||||
drop_subscriptions_before_start,
|
||||
reconfigure_concurrency: 1,
|
||||
features: vec![],
|
||||
features,
|
||||
cluster: None,
|
||||
compute_ctl_config,
|
||||
privileged_role_name,
|
||||
@@ -387,9 +397,8 @@ pub struct EndpointStartArgs {
|
||||
pub endpoint_storage_addr: String,
|
||||
pub safekeepers_generation: Option<SafekeeperGeneration>,
|
||||
pub safekeepers: Vec<NodeId>,
|
||||
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
pub pageserver_conninfo: PageserverConnectionInfo,
|
||||
pub remote_ext_base_url: Option<String>,
|
||||
pub shard_stripe_size: usize,
|
||||
pub create_test_user: bool,
|
||||
pub start_timeout: Duration,
|
||||
pub autoprewarm: bool,
|
||||
@@ -662,14 +671,6 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
/// Render the pageserver list as one comma-separated connection string, with each entry
/// formatted as `<scheme>://no_user@<host>:<port>`.
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
    let mut connstr = String::new();
    for (idx, (scheme, host, port)) in pageservers.iter().enumerate() {
        // Separator goes between entries only, never before the first one.
        if idx > 0 {
            connstr.push(',');
        }
        connstr.push_str(&format!("{scheme}://no_user@{host}:{port}"));
    }
    connstr
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
@@ -715,9 +716,6 @@ impl Endpoint {
|
||||
std::fs::remove_dir_all(self.pgdata())?;
|
||||
}
|
||||
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
@@ -732,6 +730,44 @@ impl Endpoint {
|
||||
remote_extensions = None;
|
||||
};
|
||||
|
||||
// For the sake of backwards-compatibility, also fill in 'pageserver_connstring'
|
||||
//
|
||||
// XXX: I believe this is not really needed, except to make
|
||||
// test_forward_compatibility happy.
|
||||
//
|
||||
// Use a closure so that we can conveniently return None in the middle of the
|
||||
// loop.
|
||||
let pageserver_connstring: Option<String> = (|| {
|
||||
let num_shards = args.pageserver_conninfo.shard_count.count();
|
||||
let mut connstrings = Vec::new();
|
||||
for shard_no in 0..num_shards {
|
||||
let shard_index = ShardIndex {
|
||||
shard_count: args.pageserver_conninfo.shard_count,
|
||||
shard_number: ShardNumber(shard_no),
|
||||
};
|
||||
let shard = args
|
||||
.pageserver_conninfo
|
||||
.shards
|
||||
.get(&shard_index)
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"shard {} not found in pageserver_connection_info",
|
||||
shard_index
|
||||
)
|
||||
})?;
|
||||
let pageserver = shard
|
||||
.pageservers
|
||||
.first()
|
||||
.ok_or(anyhow!("must have at least one pageserver"))?;
|
||||
if let Some(libpq_url) = &pageserver.libpq_url {
|
||||
connstrings.push(libpq_url.clone());
|
||||
} else {
|
||||
return Ok::<_, anyhow::Error>(None);
|
||||
}
|
||||
}
|
||||
Ok(Some(connstrings.join(",")))
|
||||
})()?;
|
||||
|
||||
// Create config file
|
||||
let config = {
|
||||
let mut spec = ComputeSpec {
|
||||
@@ -776,13 +812,14 @@ impl Endpoint {
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
pageserver_connection_info: Some(args.pageserver_conninfo.clone()),
|
||||
pageserver_connstring,
|
||||
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: args.auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(args.shard_stripe_size),
|
||||
shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size
|
||||
local_proxy_config: None,
|
||||
reconfigure_concurrency: self.reconfigure_concurrency,
|
||||
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
|
||||
@@ -793,6 +830,7 @@ impl Endpoint {
|
||||
autoprewarm: args.autoprewarm,
|
||||
offload_lfc_interval_seconds: args.offload_lfc_interval_seconds,
|
||||
suspend_timeout_seconds: -1, // Only used in neon_local.
|
||||
databricks_settings: None,
|
||||
};
|
||||
|
||||
// this strange code is needed to support respec() in tests
|
||||
@@ -919,7 +957,7 @@ impl Endpoint {
|
||||
}
|
||||
// keep retrying
|
||||
}
|
||||
ComputeStatus::Running => {
|
||||
ComputeStatus::Reloading | ComputeStatus::Running => {
|
||||
// All good!
|
||||
break;
|
||||
}
|
||||
@@ -937,7 +975,9 @@ impl Endpoint {
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPendingFast
|
||||
| ComputeStatus::TerminationPendingImmediate
|
||||
| ComputeStatus::Terminated => {
|
||||
| ComputeStatus::Terminated
|
||||
| ComputeStatus::RefreshConfigurationPending
|
||||
| ComputeStatus::RefreshConfiguration => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
}
|
||||
@@ -960,6 +1000,27 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Update the pageservers in the spec file of the endpoint. This is useful to test the spec refresh scenario.
|
||||
pub async fn update_pageservers_in_config(
|
||||
&self,
|
||||
pageserver_conninfo: &PageserverConnectionInfo,
|
||||
) -> Result<()> {
|
||||
let config_path = self.endpoint_path().join("config.json");
|
||||
let mut config: ComputeConfig = {
|
||||
let file = std::fs::File::open(&config_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
|
||||
let mut spec = config.spec.unwrap();
|
||||
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
|
||||
config.spec = Some(spec);
|
||||
|
||||
let file = std::fs::File::create(&config_path)?;
|
||||
serde_json::to_writer_pretty(file, &config)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Call the /status HTTP API
|
||||
pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
|
||||
let client = reqwest::Client::new();
|
||||
@@ -994,8 +1055,7 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure(
|
||||
&self,
|
||||
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
pageserver_conninfo: Option<&PageserverConnectionInfo>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
safekeeper_generation: Option<SafekeeperGeneration>,
|
||||
) -> Result<()> {
|
||||
@@ -1010,15 +1070,15 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If pageservers are not specified, don't change them.
|
||||
if let Some(pageservers) = pageservers {
|
||||
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
|
||||
|
||||
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
|
||||
spec.pageserver_connstring = Some(pageserver_connstr);
|
||||
if stripe_size.is_some() {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
if let Some(pageserver_conninfo) = pageserver_conninfo {
|
||||
// If pageservers are provided, we need to ensure that they are not empty.
|
||||
// This is a requirement for the compute_ctl configuration.
|
||||
anyhow::ensure!(
|
||||
!pageserver_conninfo.shards.is_empty(),
|
||||
"no pageservers provided"
|
||||
);
|
||||
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
|
||||
spec.shard_stripe_size = pageserver_conninfo.stripe_size;
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
@@ -1067,11 +1127,9 @@ impl Endpoint {
|
||||
|
||||
pub async fn reconfigure_pageservers(
|
||||
&self,
|
||||
pageservers: Vec<(PageserverProtocol, Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
pageservers: &PageserverConnectionInfo,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(Some(pageservers), stripe_size, None, None)
|
||||
.await
|
||||
self.reconfigure(Some(pageservers), None, None).await
|
||||
}
|
||||
|
||||
pub async fn reconfigure_safekeepers(
|
||||
@@ -1079,7 +1137,7 @@ impl Endpoint {
|
||||
safekeepers: Vec<NodeId>,
|
||||
generation: SafekeeperGeneration,
|
||||
) -> Result<()> {
|
||||
self.reconfigure(None, None, Some(safekeepers), Some(generation))
|
||||
self.reconfigure(None, Some(safekeepers), Some(generation))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1125,6 +1183,33 @@ impl Endpoint {
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
pub async fn refresh_configuration(&self) -> Result<()> {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.unwrap();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/refresh_configuration",
|
||||
self.internal_http_address.ip(),
|
||||
self.internal_http_address.port()
|
||||
))
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(())
|
||||
} else {
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text().await {
|
||||
Ok(err_body) => format!("Error: {err_body}"),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
Err(anyhow::anyhow!(msg))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn connstr(&self, user: &str, db_name: &str) -> String {
|
||||
format!(
|
||||
"postgresql://{}@{}:{}/{}",
|
||||
@@ -1135,3 +1220,84 @@ impl Endpoint {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
/// fully managed by storage controller, therefore not sharded.
|
||||
pub fn local_pageserver_conf_to_conn_info(
|
||||
conf: &crate::local_env::PageServerConf,
|
||||
) -> Result<PageserverConnectionInfo> {
|
||||
let libpq_url = {
|
||||
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
|
||||
let port = port.unwrap_or(5432);
|
||||
Some(format!("postgres://no_user@{host}:{port}"))
|
||||
};
|
||||
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
|
||||
let (host, port) = parse_host_port(grpc_addr)?;
|
||||
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
|
||||
Some(format!("grpc://no_user@{host}:{port}"))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let ps_conninfo = PageserverShardConnectionInfo {
|
||||
id: Some(conf.id),
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
};
|
||||
|
||||
let shard_info = PageserverShardInfo {
|
||||
pageservers: vec![ps_conninfo],
|
||||
};
|
||||
|
||||
let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count: ShardCount::unsharded(),
|
||||
stripe_size: None,
|
||||
shards,
|
||||
prefer_protocol: PageserverProtocol::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn tenant_locate_response_to_conn_info(
|
||||
response: &pageserver_api::controller_api::TenantLocateResponse,
|
||||
) -> Result<PageserverConnectionInfo> {
|
||||
let mut shards = HashMap::new();
|
||||
for shard in response.shards.iter() {
|
||||
tracing::info!("parsing {}", shard.listen_pg_addr);
|
||||
let libpq_url = {
|
||||
let host = &shard.listen_pg_addr;
|
||||
let port = shard.listen_pg_port;
|
||||
Some(format!("postgres://no_user@{host}:{port}"))
|
||||
};
|
||||
let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr {
|
||||
let host = grpc_addr;
|
||||
let port = shard.listen_grpc_port.expect("no gRPC port");
|
||||
Some(format!("grpc://no_user@{host}:{port}"))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let shard_info = PageserverShardInfo {
|
||||
pageservers: vec![PageserverShardConnectionInfo {
|
||||
id: Some(shard.node_id),
|
||||
libpq_url,
|
||||
grpc_url,
|
||||
}],
|
||||
};
|
||||
|
||||
shards.insert(shard.shard_id.to_index(), shard_info);
|
||||
}
|
||||
|
||||
let stripe_size = if response.shard_params.count.is_unsharded() {
|
||||
None
|
||||
} else {
|
||||
Some(response.shard_params.stripe_size)
|
||||
};
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count: response.shard_params.count,
|
||||
stripe_size,
|
||||
shards,
|
||||
prefer_protocol: PageserverProtocol::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ use std::{env, fs};
|
||||
|
||||
use anyhow::{Context, bail};
|
||||
use clap::ValueEnum;
|
||||
use compute_api::responses::TlsConfig;
|
||||
use pageserver_api::config::PostHogConfig;
|
||||
use pem::Pem;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -95,7 +96,10 @@ pub struct LocalEnv {
|
||||
|
||||
/// Flag to generate SSL certificates for components that need it.
|
||||
/// Also generates root CA certificate that is used to sign all other certificates.
|
||||
pub generate_local_ssl_certs: bool,
|
||||
pub generate_local_tls_certs: bool,
|
||||
|
||||
/// Flag to generate SSL certificates for compute.
|
||||
pub generate_compute_tls_certs: bool,
|
||||
}
|
||||
|
||||
/// On-disk state stored in `.neon/config`.
|
||||
@@ -123,7 +127,11 @@ pub struct OnDiskConfig {
|
||||
// Note: skip serializing because in compat tests old storage controller fails
|
||||
// to load new config file. May be removed after this field is in release branch.
|
||||
#[serde(skip_serializing_if = "std::ops::Not::not")]
|
||||
pub generate_local_ssl_certs: bool,
|
||||
pub generate_local_tls_certs: bool,
|
||||
// Note: skip serializing because in compat tests old storage controller fails
|
||||
// to load new config file. May be removed after this field is in release branch.
|
||||
#[serde(skip_serializing_if = "std::ops::Not::not")]
|
||||
pub generate_compute_tls_certs: bool,
|
||||
}
|
||||
|
||||
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
|
||||
@@ -152,7 +160,8 @@ pub struct NeonLocalInitConf {
|
||||
pub endpoint_storage: EndpointStorageConf,
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_hooks_api: Option<Url>,
|
||||
pub generate_local_ssl_certs: bool,
|
||||
pub generate_local_tls_certs: bool,
|
||||
pub generate_compute_tls_certs: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
@@ -511,7 +520,7 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
|
||||
if self.generate_local_ssl_certs {
|
||||
if self.generate_local_tls_certs {
|
||||
Some(self.base_data_dir.join("rootCA.crt"))
|
||||
} else {
|
||||
None
|
||||
@@ -519,7 +528,7 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
|
||||
if self.generate_local_ssl_certs {
|
||||
if self.generate_local_tls_certs {
|
||||
Some(self.base_data_dir.join("rootCA.key"))
|
||||
} else {
|
||||
None
|
||||
@@ -545,6 +554,33 @@ impl LocalEnv {
|
||||
)
|
||||
}
|
||||
|
||||
fn compute_ssl_paths(&self) -> Option<(PathBuf, PathBuf)> {
|
||||
if self.generate_compute_tls_certs {
|
||||
Some((
|
||||
self.base_data_dir.join("compute_server.crt"),
|
||||
self.base_data_dir.join("compute_server.key"),
|
||||
))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_compute_ssl_cert(&self) -> anyhow::Result<()> {
|
||||
self.generate_ssl_ca_cert()?;
|
||||
|
||||
let (cert_path, key_path) = self.compute_ssl_paths().unwrap();
|
||||
if !fs::exists(&cert_path)? {
|
||||
generate_ssl_cert(
|
||||
&cert_path,
|
||||
&key_path,
|
||||
self.ssl_ca_cert_path().unwrap().as_path(),
|
||||
self.ssl_ca_key_path().unwrap().as_path(),
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Creates HTTP client with local SSL CA certificates.
|
||||
pub fn create_http_client(&self) -> reqwest::Client {
|
||||
let ssl_ca_certs = self.ssl_ca_cert_path().map(|ssl_ca_file| {
|
||||
@@ -673,7 +709,8 @@ impl LocalEnv {
|
||||
control_plane_hooks_api,
|
||||
control_plane_compute_hook_api: _,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
generate_local_tls_certs,
|
||||
generate_compute_tls_certs,
|
||||
endpoint_storage,
|
||||
} = on_disk_config;
|
||||
LocalEnv {
|
||||
@@ -690,7 +727,8 @@ impl LocalEnv {
|
||||
control_plane_api: control_plane_api.unwrap(),
|
||||
control_plane_hooks_api,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
generate_local_tls_certs,
|
||||
generate_compute_tls_certs,
|
||||
endpoint_storage,
|
||||
}
|
||||
};
|
||||
@@ -806,7 +844,8 @@ impl LocalEnv {
|
||||
control_plane_hooks_api: self.control_plane_hooks_api.clone(),
|
||||
control_plane_compute_hook_api: None,
|
||||
branch_name_mappings: self.branch_name_mappings.clone(),
|
||||
generate_local_ssl_certs: self.generate_local_ssl_certs,
|
||||
generate_local_tls_certs: self.generate_local_tls_certs,
|
||||
generate_compute_tls_certs: self.generate_compute_tls_certs,
|
||||
endpoint_storage: self.endpoint_storage.clone(),
|
||||
},
|
||||
)
|
||||
@@ -861,6 +900,21 @@ impl LocalEnv {
|
||||
Ok(pem)
|
||||
}
|
||||
|
||||
/// Get the TLS config if set.
|
||||
pub fn get_tls_config(&self) -> anyhow::Result<Option<TlsConfig>> {
|
||||
match self.compute_ssl_paths() {
|
||||
Some((cert_path, key_path)) => {
|
||||
self.generate_compute_ssl_cert()?;
|
||||
|
||||
Ok(Some(TlsConfig {
|
||||
key_path: key_path.to_str().context("utf8")?.to_string(),
|
||||
cert_path: cert_path.to_str().context("utf8")?.to_string(),
|
||||
}))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
|
||||
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
|
||||
let base_path = base_path();
|
||||
@@ -912,7 +966,8 @@ impl LocalEnv {
|
||||
pageservers,
|
||||
safekeepers,
|
||||
control_plane_api,
|
||||
generate_local_ssl_certs,
|
||||
generate_local_tls_certs,
|
||||
generate_compute_tls_certs,
|
||||
control_plane_hooks_api,
|
||||
endpoint_storage,
|
||||
} = conf;
|
||||
@@ -965,13 +1020,17 @@ impl LocalEnv {
|
||||
control_plane_api: control_plane_api.unwrap(),
|
||||
control_plane_hooks_api,
|
||||
branch_name_mappings: Default::default(),
|
||||
generate_local_ssl_certs,
|
||||
generate_local_tls_certs,
|
||||
generate_compute_tls_certs,
|
||||
endpoint_storage,
|
||||
};
|
||||
|
||||
if generate_local_ssl_certs {
|
||||
if generate_local_tls_certs {
|
||||
env.generate_ssl_ca_cert()?;
|
||||
}
|
||||
if generate_compute_tls_certs {
|
||||
env.generate_compute_ssl_cert()?;
|
||||
}
|
||||
|
||||
// create endpoints dir
|
||||
fs::create_dir_all(env.endpoints_path())?;
|
||||
|
||||
@@ -241,7 +241,7 @@ impl PageServerNode {
|
||||
.context("write identity toml")?;
|
||||
drop(identity_toml);
|
||||
|
||||
if self.env.generate_local_ssl_certs {
|
||||
if self.env.generate_local_tls_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
datadir.join("server.crt").as_path(),
|
||||
datadir.join("server.key").as_path(),
|
||||
|
||||
@@ -102,7 +102,7 @@ impl SafekeeperNode {
|
||||
/// Initializes a safekeeper node by creating all necessary files,
|
||||
/// e.g. SSL certificates and JWT token file.
|
||||
pub fn initialize(&self) -> anyhow::Result<()> {
|
||||
if self.env.generate_local_ssl_certs {
|
||||
if self.env.generate_local_tls_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
&self.datadir_path().join("server.crt"),
|
||||
&self.datadir_path().join("server.key"),
|
||||
|
||||
@@ -353,7 +353,7 @@ impl StorageController {
|
||||
}
|
||||
}
|
||||
|
||||
if self.env.generate_local_ssl_certs {
|
||||
if self.env.generate_local_tls_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
&instance_dir.join("server.crt"),
|
||||
&instance_dir.join("server.key"),
|
||||
|
||||
@@ -27,7 +27,6 @@ pub struct ComputeConfig {
|
||||
pub spec: Option<ComputeSpec>,
|
||||
|
||||
/// The compute_ctl configuration
|
||||
#[allow(dead_code)]
|
||||
pub compute_ctl_config: ComputeCtlConfig,
|
||||
}
|
||||
|
||||
@@ -108,11 +107,10 @@ pub enum PromoteState {
|
||||
Failed { error: String },
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Default, Debug, Clone)]
|
||||
#[derive(Deserialize, Default, Debug)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
/// Result of /safekeepers_lsn
|
||||
pub struct SafekeepersLsn {
|
||||
pub safekeepers: String,
|
||||
pub struct PromoteConfig {
|
||||
pub spec: ComputeSpec,
|
||||
pub wal_flush_lsn: utils::lsn::Lsn,
|
||||
}
|
||||
|
||||
@@ -156,6 +154,8 @@ pub enum ComputeStatus {
|
||||
Empty,
|
||||
// Compute configuration was requested.
|
||||
ConfigurationPending,
|
||||
// Postgres, pgbouncer, and local_proxy is currently being reloaded.
|
||||
Reloading,
|
||||
// Compute node has spec and initial startup and
|
||||
// configuration is in progress.
|
||||
Init,
|
||||
@@ -173,6 +173,11 @@ pub enum ComputeStatus {
|
||||
TerminationPendingImmediate,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
// A spec refresh is being requested
|
||||
RefreshConfigurationPending,
|
||||
// A spec refresh is being applied. We cannot refresh configuration again until the current
|
||||
// refresh is done, i.e., signal_refresh_configuration() will return 500 error.
|
||||
RefreshConfiguration,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
@@ -185,6 +190,11 @@ impl Display for ComputeStatus {
|
||||
match self {
|
||||
ComputeStatus::Empty => f.write_str("empty"),
|
||||
ComputeStatus::ConfigurationPending => f.write_str("configuration-pending"),
|
||||
ComputeStatus::Reloading => f.write_str("reloading"),
|
||||
ComputeStatus::RefreshConfiguration => f.write_str("refresh-configuration"),
|
||||
ComputeStatus::RefreshConfigurationPending => {
|
||||
f.write_str("refresh-configuration-pending")
|
||||
}
|
||||
ComputeStatus::Init => f.write_str("init"),
|
||||
ComputeStatus::Running => f.write_str("running"),
|
||||
ComputeStatus::Configuration => f.write_str("configuration"),
|
||||
|
||||
@@ -12,8 +12,9 @@ use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use url::Url;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};
|
||||
|
||||
use crate::responses::TlsConfig;
|
||||
|
||||
@@ -105,8 +106,27 @@ pub struct ComputeSpec {
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
|
||||
/// Pageserver information can be passed in three different ways:
|
||||
/// 1. Here in `pageserver_connection_info`
|
||||
/// 2. In the `pageserver_connstring` field.
|
||||
/// 3. in `cluster.settings`.
|
||||
///
|
||||
/// The goal is to use method 1. everywhere. But for backwards-compatibility with old
|
||||
/// versions of the control plane, `compute_ctl` will check 2. and 3. if the
|
||||
/// `pageserver_connection_info` field is missing.
|
||||
///
|
||||
/// If both `pageserver_connection_info` and `pageserver_connstring`+`shard_stripe_size` are
|
||||
/// given, they must contain the same information.
|
||||
pub pageserver_connection_info: Option<PageserverConnectionInfo>,
|
||||
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
/// Stripe size for pageserver sharding, in pages. This is set together with the legacy
|
||||
/// `pageserver_connstring` field. When the modern `pageserver_connection_info` field is used,
|
||||
/// the stripe size is stored in `pageserver_connection_info.stripe_size` instead.
|
||||
pub shard_stripe_size: Option<ShardStripeSize>,
|
||||
|
||||
// More neon ids that we expose to the compute_ctl
|
||||
// and to postgres as neon extension GUCs.
|
||||
pub project_id: Option<String>,
|
||||
@@ -139,10 +159,6 @@ pub struct ComputeSpec {
|
||||
|
||||
pub pgbouncer_settings: Option<IndexMap<String, String>>,
|
||||
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
/// Local Proxy configuration used for JWT authentication
|
||||
#[serde(default)]
|
||||
pub local_proxy_config: Option<LocalProxySpec>,
|
||||
@@ -193,6 +209,9 @@ pub struct ComputeSpec {
|
||||
///
|
||||
/// We use this value to derive other values, such as the installed extensions metric.
|
||||
pub suspend_timeout_seconds: i64,
|
||||
|
||||
// Databricks specific options for compute instance.
|
||||
pub databricks_settings: Option<DatabricksSettings>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
@@ -214,6 +233,140 @@ pub enum ComputeFeature {
|
||||
UnknownFeature,
|
||||
}
|
||||
|
||||
/// Describes how a compute reaches the pageservers of a tenant: the shard layout
/// (`shard_count`, `stripe_size`), the per-shard connection info, and the preferred protocol.
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverConnectionInfo {
|
||||
/// Number of shards.
///
/// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
|
||||
pub shard_count: ShardCount,
|
||||
|
||||
/// Stripe size for pageserver sharding, in pages.
///
/// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
|
||||
pub stripe_size: Option<ShardStripeSize>,
|
||||
|
||||
/// Connection info for each shard, keyed by its `ShardIndex`.
pub shards: HashMap<ShardIndex, PageserverShardInfo>,
|
||||
|
||||
/// If the compute supports both protocols, this indicates which one it should use. The compute
|
||||
/// may use other available protocols too, if it doesn't support the preferred one. The URLs
|
||||
/// for the protocol specified here must be present for all shards, i.e. do not mark a protocol
|
||||
/// as preferred if it cannot actually be used with all the pageservers.
|
||||
#[serde(default)]
|
||||
pub prefer_protocol: PageserverProtocol,
|
||||
}
|
||||
|
||||
/// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings.
|
||||
///
|
||||
/// This is used for backwards-compatibility, to parse the legacy
|
||||
/// [ComputeSpec::pageserver_connstring] field, or the 'neon.pageserver_connstring' GUC. Nowadays,
|
||||
/// the 'pageserver_connection_info' field should be used instead.
|
||||
impl PageserverConnectionInfo {
|
||||
pub fn from_connstr(
|
||||
connstr: &str,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> Result<PageserverConnectionInfo, anyhow::Error> {
|
||||
let shard_infos: Vec<_> = connstr
|
||||
.split(',')
|
||||
.map(|connstr| PageserverShardInfo {
|
||||
pageservers: vec![PageserverShardConnectionInfo {
|
||||
id: None,
|
||||
libpq_url: Some(connstr.to_string()),
|
||||
grpc_url: None,
|
||||
}],
|
||||
})
|
||||
.collect();
|
||||
|
||||
match shard_infos.len() {
|
||||
0 => anyhow::bail!("empty connection string"),
|
||||
1 => {
|
||||
// We assume that if there's only connection string, it means "unsharded",
|
||||
// rather than a sharded system with just a single shard. The latter is
|
||||
// possible in principle, but we never do it.
|
||||
let shard_count = ShardCount::unsharded();
|
||||
let only_shard = shard_infos.first().unwrap().clone();
|
||||
let shards = vec![(ShardIndex::unsharded(), only_shard)];
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count,
|
||||
stripe_size: None,
|
||||
shards: shards.into_iter().collect(),
|
||||
prefer_protocol: PageserverProtocol::Libpq,
|
||||
})
|
||||
}
|
||||
n => {
|
||||
if stripe_size.is_none() {
|
||||
anyhow::bail!("{n} shards but no stripe_size");
|
||||
}
|
||||
let shard_count = ShardCount(n.try_into()?);
|
||||
let shards = shard_infos
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(idx, shard_info)| {
|
||||
(
|
||||
ShardIndex {
|
||||
shard_count,
|
||||
shard_number: ShardNumber(
|
||||
idx.try_into().expect("shard number fits in u8"),
|
||||
),
|
||||
},
|
||||
shard_info,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Ok(PageserverConnectionInfo {
|
||||
shard_count,
|
||||
stripe_size,
|
||||
shards,
|
||||
prefer_protocol: PageserverProtocol::Libpq,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience routine to get the connection string for a shard.
|
||||
pub fn shard_url(
|
||||
&self,
|
||||
shard_number: ShardNumber,
|
||||
protocol: PageserverProtocol,
|
||||
) -> anyhow::Result<&str> {
|
||||
let shard_index = ShardIndex {
|
||||
shard_number,
|
||||
shard_count: self.shard_count,
|
||||
};
|
||||
let shard = self.shards.get(&shard_index).ok_or_else(|| {
|
||||
anyhow::anyhow!("shard connection info missing for shard {}", shard_index)
|
||||
})?;
|
||||
|
||||
// Just use the first pageserver in the list. That's good enough for this
|
||||
// convenience routine; if you need more control, like round robin policy or
|
||||
// failover support, roll your own. (As of this writing, we never have more than
|
||||
// one pageserver per shard anyway, but that will change in the future.)
|
||||
let pageserver = shard
|
||||
.pageservers
|
||||
.first()
|
||||
.ok_or(anyhow::anyhow!("must have at least one pageserver"))?;
|
||||
|
||||
let result = match protocol {
|
||||
PageserverProtocol::Grpc => pageserver
|
||||
.grpc_url
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("no grpc_url for shard {shard_index}"))?,
|
||||
PageserverProtocol::Libpq => pageserver
|
||||
.libpq_url
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("no libpq_url for shard {shard_index}"))?,
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
/// Connection information for one shard: the pageservers that can be contacted for it.
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverShardInfo {
|
||||
/// Pageservers for this shard. `PageserverConnectionInfo::shard_url` uses the first entry.
pub pageservers: Vec<PageserverShardConnectionInfo>,
|
||||
}
|
||||
|
||||
/// How to reach a single pageserver for a shard. Each URL is optional; a URL is only usable
/// for the protocol it belongs to.
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
|
||||
pub struct PageserverShardConnectionInfo {
|
||||
/// Node ID of the pageserver, if known. `PageserverConnectionInfo::from_connstr` leaves it `None`.
pub id: Option<NodeId>,
|
||||
/// URL used with `PageserverProtocol::Libpq`, if available.
pub libpq_url: Option<String>,
|
||||
/// URL used with `PageserverProtocol::Grpc`, if available.
pub grpc_url: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct RemoteExtSpec {
|
||||
pub public_extensions: Option<Vec<String>>,
|
||||
@@ -331,6 +484,12 @@ impl ComputeMode {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for ComputeMode {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.to_type_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
@@ -467,13 +626,15 @@ pub struct JwksSettings {
|
||||
pub jwt_audience: Option<String>,
|
||||
}
|
||||
|
||||
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
|
||||
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
|
||||
/// Protocol used to connect to a Pageserver.
|
||||
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
|
||||
pub enum PageserverProtocol {
|
||||
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
|
||||
#[default]
|
||||
#[serde(rename = "libpq")]
|
||||
Libpq,
|
||||
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
|
||||
#[serde(rename = "grpc")]
|
||||
Grpc,
|
||||
}
|
||||
|
||||
|
||||
@@ -558,11 +558,11 @@ async fn add_request_id_header_to_response(
|
||||
mut res: Response<Body>,
|
||||
req_info: RequestInfo,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if let Some(request_id) = req_info.context::<RequestId>() {
|
||||
if let Ok(request_header_value) = HeaderValue::from_str(&request_id.0) {
|
||||
res.headers_mut()
|
||||
.insert(&X_REQUEST_ID_HEADER, request_header_value);
|
||||
};
|
||||
if let Some(request_id) = req_info.context::<RequestId>()
|
||||
&& let Ok(request_header_value) = HeaderValue::from_str(&request_id.0)
|
||||
{
|
||||
res.headers_mut()
|
||||
.insert(&X_REQUEST_ID_HEADER, request_header_value);
|
||||
};
|
||||
|
||||
Ok(res)
|
||||
|
||||
@@ -72,10 +72,10 @@ impl Server {
|
||||
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
|
||||
return true;
|
||||
}
|
||||
if let Some(inner) = err.source() {
|
||||
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
|
||||
return suppress_io_error(io);
|
||||
}
|
||||
if let Some(inner) = err.source()
|
||||
&& let Some(io) = inner.downcast_ref::<std::io::Error>()
|
||||
{
|
||||
return suppress_io_error(io);
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
@@ -129,6 +129,12 @@ impl<L: LabelGroup> InfoMetric<L> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the metric from the label group's default value via `InfoMetric::new`.
impl<L: LabelGroup + Default> Default for InfoMetric<L, GaugeState> {
    fn default() -> Self {
        let label = L::default();
        Self::new(label)
    }
}
|
||||
|
||||
impl<L: LabelGroup, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
|
||||
pub fn with_metric(label: L, metric: M) -> Self {
|
||||
Self {
|
||||
|
||||
@@ -363,7 +363,7 @@ where
|
||||
// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
|
||||
// _slowly_ iterate through all buckets with its clock hand, without holding a lock.
|
||||
// If we switch to an Iterator, it must not hold the lock.
|
||||
pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
|
||||
pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<'_, (K, V)>> {
|
||||
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
|
||||
if pos >= map.buckets.len() {
|
||||
return None;
|
||||
|
||||
@@ -1500,6 +1500,7 @@ pub struct TimelineArchivalConfigRequest {
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
pub struct TimelinePatchIndexPartRequest {
|
||||
pub rel_size_migration: Option<RelSizeMigration>,
|
||||
pub rel_size_migrated_at: Option<Lsn>,
|
||||
pub gc_compaction_last_completed_lsn: Option<Lsn>,
|
||||
pub applied_gc_cutoff_lsn: Option<Lsn>,
|
||||
#[serde(default)]
|
||||
@@ -1533,10 +1534,10 @@ pub enum RelSizeMigration {
|
||||
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
|
||||
Legacy,
|
||||
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
|
||||
/// persisted in the index part. The read path will read both formats and merge them.
|
||||
/// persisted in the storage. The read path will read both formats and validate them.
|
||||
Migrating,
|
||||
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
|
||||
/// in the index part, and the read path will not read the old format.
|
||||
/// in the storage, and the read path will not read the old format.
|
||||
Migrated,
|
||||
}
|
||||
|
||||
@@ -1619,6 +1620,7 @@ pub struct TimelineInfo {
|
||||
|
||||
/// The status of the rel_size migration.
|
||||
pub rel_size_migration: Option<RelSizeMigration>,
|
||||
pub rel_size_migrated_at: Option<Lsn>,
|
||||
|
||||
/// Whether the timeline is invisible in synthetic size calculations.
|
||||
pub is_invisible: Option<bool>,
|
||||
|
||||
@@ -9,10 +9,7 @@ regex.workspace = true
|
||||
bytes.workspace = true
|
||||
anyhow.workspace = true
|
||||
crc32c.workspace = true
|
||||
criterion.workspace = true
|
||||
once_cell.workspace = true
|
||||
log.workspace = true
|
||||
memoffset.workspace = true
|
||||
pprof.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -22,6 +19,7 @@ tracing.workspace = true
|
||||
postgres_versioninfo.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
|
||||
@@ -34,9 +34,8 @@ const SIZEOF_CONTROLDATA: usize = size_of::<ControlFileData>();
|
||||
impl ControlFileData {
|
||||
/// Compute the offset of the `crc` field within the `ControlFileData` struct.
|
||||
/// Equivalent to offsetof(ControlFileData, crc) in C.
|
||||
// Someday this can be const when the right compiler features land.
|
||||
fn pg_control_crc_offset() -> usize {
|
||||
memoffset::offset_of!(ControlFileData, crc)
|
||||
const fn pg_control_crc_offset() -> usize {
|
||||
std::mem::offset_of!(ControlFileData, crc)
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
@@ -4,12 +4,11 @@
|
||||
use crate::pg_constants;
|
||||
use crate::transaction_id_precedes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
use super::bindings::MultiXactId;
|
||||
|
||||
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
tracing::trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
|
||||
@@ -14,7 +14,6 @@ use super::xlog_utils::*;
|
||||
use crate::WAL_SEGMENT_SIZE;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::num::NonZeroU32;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -236,7 +235,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder {
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
let next_lsn = if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
tracing::trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64)
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
|
||||
@@ -23,8 +23,6 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use log::*;
|
||||
|
||||
use serde::Serialize;
|
||||
use std::ffi::{CString, OsStr};
|
||||
use std::fs::File;
|
||||
@@ -235,7 +233,7 @@ pub fn find_end_of_wal(
|
||||
let mut curr_lsn = start_lsn;
|
||||
let mut buf = [0u8; XLOG_BLCKSZ];
|
||||
let pg_version = MY_PGVERSION;
|
||||
debug!("find_end_of_wal PG_VERSION: {}", pg_version);
|
||||
tracing::debug!("find_end_of_wal PG_VERSION: {}", pg_version);
|
||||
|
||||
let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
|
||||
|
||||
@@ -247,7 +245,7 @@ pub fn find_end_of_wal(
|
||||
match open_wal_segment(&seg_file_path)? {
|
||||
None => {
|
||||
// no more segments
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
|
||||
result, seg_file_path
|
||||
);
|
||||
@@ -260,7 +258,7 @@ pub fn find_end_of_wal(
|
||||
while curr_lsn.segment_number(wal_seg_size) == segno {
|
||||
let bytes_read = segment.read(&mut buf)?;
|
||||
if bytes_read == 0 {
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
|
||||
result,
|
||||
seg_file_path,
|
||||
@@ -276,7 +274,7 @@ pub fn find_end_of_wal(
|
||||
match decoder.poll_decode() {
|
||||
Ok(Some(record)) => result = record.0,
|
||||
Err(e) => {
|
||||
debug!(
|
||||
tracing::debug!(
|
||||
"find_end_of_wal reached end at {:?}, decode error: {:?}",
|
||||
result, e
|
||||
);
|
||||
|
||||
@@ -15,6 +15,7 @@ use tokio::sync::mpsc;
|
||||
use crate::cancel_token::RawCancelToken;
|
||||
use crate::codec::{BackendMessages, FrontendMessage, RecordNotices};
|
||||
use crate::config::{Host, SslMode};
|
||||
use crate::connection::gc_bytesmut;
|
||||
use crate::query::RowStream;
|
||||
use crate::simple_query::SimpleQueryStream;
|
||||
use crate::types::{Oid, Type};
|
||||
@@ -95,20 +96,13 @@ impl InnerClient {
|
||||
Ok(PartialQuery(Some(self)))
|
||||
}
|
||||
|
||||
// pub fn send_with_sync<F>(&mut self, f: F) -> Result<&mut Responses, Error>
|
||||
// where
|
||||
// F: FnOnce(&mut BytesMut) -> Result<(), Error>,
|
||||
// {
|
||||
// self.start()?.send_with_sync(f)
|
||||
// }
|
||||
|
||||
pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> {
|
||||
self.responses.waiting += 1;
|
||||
|
||||
self.buffer.clear();
|
||||
// simple queries do not need sync.
|
||||
frontend::query(query, &mut self.buffer).map_err(Error::encode)?;
|
||||
let buf = self.buffer.split().freeze();
|
||||
let buf = self.buffer.split();
|
||||
self.send_message(FrontendMessage::Raw(buf))
|
||||
}
|
||||
|
||||
@@ -125,7 +119,7 @@ impl Drop for PartialQuery<'_> {
|
||||
if let Some(client) = self.0.take() {
|
||||
client.buffer.clear();
|
||||
frontend::sync(&mut client.buffer);
|
||||
let buf = client.buffer.split().freeze();
|
||||
let buf = client.buffer.split();
|
||||
let _ = client.send_message(FrontendMessage::Raw(buf));
|
||||
}
|
||||
}
|
||||
@@ -141,7 +135,7 @@ impl<'a> PartialQuery<'a> {
|
||||
client.buffer.clear();
|
||||
f(&mut client.buffer)?;
|
||||
frontend::flush(&mut client.buffer);
|
||||
let buf = client.buffer.split().freeze();
|
||||
let buf = client.buffer.split();
|
||||
client.send_message(FrontendMessage::Raw(buf))
|
||||
}
|
||||
|
||||
@@ -154,7 +148,7 @@ impl<'a> PartialQuery<'a> {
|
||||
client.buffer.clear();
|
||||
f(&mut client.buffer)?;
|
||||
frontend::sync(&mut client.buffer);
|
||||
let buf = client.buffer.split().freeze();
|
||||
let buf = client.buffer.split();
|
||||
let _ = client.send_message(FrontendMessage::Raw(buf));
|
||||
|
||||
Ok(&mut self.0.take().unwrap().responses)
|
||||
@@ -191,6 +185,7 @@ impl Client {
|
||||
ssl_mode: SslMode,
|
||||
process_id: i32,
|
||||
secret_key: i32,
|
||||
write_buf: BytesMut,
|
||||
) -> Client {
|
||||
Client {
|
||||
inner: InnerClient {
|
||||
@@ -201,7 +196,7 @@ impl Client {
|
||||
waiting: 0,
|
||||
received: 0,
|
||||
},
|
||||
buffer: Default::default(),
|
||||
buffer: write_buf,
|
||||
},
|
||||
cached_typeinfo: Default::default(),
|
||||
|
||||
@@ -292,8 +287,35 @@ impl Client {
|
||||
simple_query::batch_execute(self.inner_mut(), query).await
|
||||
}
|
||||
|
||||
pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
|
||||
self.batch_execute("discard all").await
|
||||
/// Similar to `discard_all`, but it does not clear any query plans
|
||||
///
|
||||
/// This runs in the background, so it can be executed without `await`ing.
|
||||
pub fn reset_session_background(&mut self) -> Result<(), Error> {
|
||||
// "ROLLBACK": aborts any transaction that is still open
// "CLOSE ALL": closes any cursors
|
||||
// "SET SESSION AUTHORIZATION DEFAULT": resets the current_user back to the session_user
|
||||
// "RESET ALL": resets any GUCs back to their session defaults.
|
||||
// "DEALLOCATE ALL": deallocates any prepared statements
|
||||
// "UNLISTEN *": stops listening on all channels
|
||||
// "SELECT pg_advisory_unlock_all();": unlocks all advisory locks
|
||||
// "DISCARD TEMP;": drops all temporary tables
|
||||
// "DISCARD SEQUENCES;": deallocates all cached sequence state
|
||||
|
||||
let _responses = self.inner_mut().send_simple_query(
|
||||
"ROLLBACK;
|
||||
CLOSE ALL;
|
||||
SET SESSION AUTHORIZATION DEFAULT;
|
||||
RESET ALL;
|
||||
DEALLOCATE ALL;
|
||||
UNLISTEN *;
|
||||
SELECT pg_advisory_unlock_all();
|
||||
DISCARD TEMP;
|
||||
DISCARD SEQUENCES;",
|
||||
)?;
|
||||
|
||||
// Clean up memory usage.
|
||||
gc_bytesmut(&mut self.inner_mut().buffer);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Begins a new database transaction.
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use std::io;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use postgres_protocol2::message::backend;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
use tokio_util::codec::{Decoder, Encoder};
|
||||
|
||||
pub enum FrontendMessage {
|
||||
Raw(Bytes),
|
||||
Raw(BytesMut),
|
||||
RecordNotices(RecordNotices),
|
||||
}
|
||||
|
||||
@@ -17,7 +17,10 @@ pub struct RecordNotices {
|
||||
}
|
||||
|
||||
pub enum BackendMessage {
|
||||
Normal { messages: BackendMessages },
|
||||
Normal {
|
||||
messages: BackendMessages,
|
||||
ready: bool,
|
||||
},
|
||||
Async(backend::Message),
|
||||
}
|
||||
|
||||
@@ -40,11 +43,11 @@ impl FallibleIterator for BackendMessages {
|
||||
|
||||
pub struct PostgresCodec;
|
||||
|
||||
impl Encoder<Bytes> for PostgresCodec {
|
||||
impl Encoder<BytesMut> for PostgresCodec {
|
||||
type Error = io::Error;
|
||||
|
||||
fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> io::Result<()> {
|
||||
dst.extend_from_slice(&item);
|
||||
fn encode(&mut self, item: BytesMut, dst: &mut BytesMut) -> io::Result<()> {
|
||||
dst.unsplit(item);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -56,6 +59,7 @@ impl Decoder for PostgresCodec {
|
||||
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<BackendMessage>, io::Error> {
|
||||
let mut idx = 0;
|
||||
|
||||
let mut ready = false;
|
||||
while let Some(header) = backend::Header::parse(&src[idx..])? {
|
||||
let len = header.len() as usize + 1;
|
||||
if src[idx..].len() < len {
|
||||
@@ -79,6 +83,7 @@ impl Decoder for PostgresCodec {
|
||||
idx += len;
|
||||
|
||||
if header.tag() == backend::READY_FOR_QUERY_TAG {
|
||||
ready = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -88,6 +93,7 @@ impl Decoder for PostgresCodec {
|
||||
} else {
|
||||
Ok(Some(BackendMessage::Normal {
|
||||
messages: BackendMessages(src.split_to(idx)),
|
||||
ready,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,9 +11,8 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::net::TcpStream;
|
||||
|
||||
use crate::connect::connect;
|
||||
use crate::connect_raw::{RawConnection, connect_raw};
|
||||
use crate::connect_raw::{self, StartupStream};
|
||||
use crate::connect_tls::connect_tls;
|
||||
use crate::maybe_tls_stream::MaybeTlsStream;
|
||||
use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream};
|
||||
use crate::{Client, Connection, Error};
|
||||
|
||||
@@ -244,24 +243,27 @@ impl Config {
|
||||
&self,
|
||||
stream: S,
|
||||
tls: T,
|
||||
) -> Result<RawConnection<S, T::Stream>, Error>
|
||||
) -> Result<StartupStream<S, T::Stream>, Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsConnect<S>,
|
||||
{
|
||||
let stream = connect_tls(stream, self.ssl_mode, tls).await?;
|
||||
connect_raw(stream, self).await
|
||||
let mut stream = StartupStream::new(stream);
|
||||
connect_raw::authenticate(&mut stream, self).await?;
|
||||
|
||||
Ok(stream)
|
||||
}
|
||||
|
||||
pub async fn authenticate<S, T>(
|
||||
pub fn authenticate<S, T>(
|
||||
&self,
|
||||
stream: MaybeTlsStream<S, T>,
|
||||
) -> Result<RawConnection<S, T>, Error>
|
||||
stream: &mut StartupStream<S, T>,
|
||||
) -> impl Future<Output = Result<(), Error>>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsStream + Unpin,
|
||||
{
|
||||
connect_raw(stream, self).await
|
||||
connect_raw::authenticate(stream, self)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
use std::net::IpAddr;
|
||||
|
||||
use futures_util::TryStreamExt;
|
||||
use postgres_protocol2::message::backend::Message;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
use crate::client::SocketConfig;
|
||||
use crate::config::Host;
|
||||
use crate::connect_raw::connect_raw;
|
||||
use crate::config::{Host, SslMode};
|
||||
use crate::connect_raw::StartupStream;
|
||||
use crate::connect_socket::connect_socket;
|
||||
use crate::connect_tls::connect_tls;
|
||||
use crate::tls::{MakeTlsConnect, TlsConnect};
|
||||
use crate::{Client, Config, Connection, Error, RawConnection};
|
||||
use crate::{Client, Config, Connection, Error};
|
||||
|
||||
pub async fn connect<T>(
|
||||
tls: &T,
|
||||
@@ -43,34 +45,78 @@ where
|
||||
T: TlsConnect<TcpStream>,
|
||||
{
|
||||
let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
|
||||
let stream = connect_tls(socket, config.ssl_mode, tls).await?;
|
||||
let RawConnection {
|
||||
let stream = config.tls_and_authenticate(socket, tls).await?;
|
||||
managed(
|
||||
stream,
|
||||
parameters: _,
|
||||
delayed_notice: _,
|
||||
process_id,
|
||||
secret_key,
|
||||
} = connect_raw(stream, config).await?;
|
||||
host_addr,
|
||||
host.clone(),
|
||||
port,
|
||||
config.ssl_mode,
|
||||
config.connect_timeout,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn managed<TlsStream>(
|
||||
mut stream: StartupStream<TcpStream, TlsStream>,
|
||||
host_addr: Option<IpAddr>,
|
||||
host: Host,
|
||||
port: u16,
|
||||
ssl_mode: SslMode,
|
||||
connect_timeout: Option<std::time::Duration>,
|
||||
) -> Result<(Client, Connection<TcpStream, TlsStream>), Error>
|
||||
where
|
||||
TlsStream: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
let (process_id, secret_key) = wait_until_ready(&mut stream).await?;
|
||||
|
||||
let socket_config = SocketConfig {
|
||||
host_addr,
|
||||
host: host.clone(),
|
||||
host,
|
||||
port,
|
||||
connect_timeout: config.connect_timeout,
|
||||
connect_timeout,
|
||||
};
|
||||
|
||||
let mut stream = stream.into_framed();
|
||||
let write_buf = std::mem::take(stream.write_buffer_mut());
|
||||
|
||||
let (client_tx, conn_rx) = mpsc::unbounded_channel();
|
||||
let (conn_tx, client_rx) = mpsc::channel(4);
|
||||
let client = Client::new(
|
||||
client_tx,
|
||||
client_rx,
|
||||
socket_config,
|
||||
config.ssl_mode,
|
||||
ssl_mode,
|
||||
process_id,
|
||||
secret_key,
|
||||
write_buf,
|
||||
);
|
||||
|
||||
let connection = Connection::new(stream, conn_tx, conn_rx);
|
||||
|
||||
Ok((client, connection))
|
||||
}
|
||||
|
||||
async fn wait_until_ready<S, T>(stream: &mut StartupStream<S, T>) -> Result<(i32, i32), Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
let mut process_id = 0;
|
||||
let mut secret_key = 0;
|
||||
|
||||
loop {
|
||||
match stream.try_next().await.map_err(Error::io)? {
|
||||
Some(Message::BackendKeyData(body)) => {
|
||||
process_id = body.process_id();
|
||||
secret_key = body.secret_key();
|
||||
}
|
||||
// These values are currently not used by `Client`/`Connection`. Ignore them.
|
||||
Some(Message::ParameterStatus(_)) | Some(Message::NoticeResponse(_)) => {}
|
||||
Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key)),
|
||||
Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
|
||||
Some(_) => return Err(Error::unexpected_message()),
|
||||
None => return Err(Error::closed()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,52 +1,27 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::pin::Pin;
|
||||
use std::task::{Context, Poll};
|
||||
use std::task::{Context, Poll, ready};
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures_util::{Sink, SinkExt, Stream, TryStreamExt, ready};
|
||||
use futures_util::{SinkExt, Stream, TryStreamExt};
|
||||
use postgres_protocol2::authentication::sasl;
|
||||
use postgres_protocol2::authentication::sasl::ScramSha256;
|
||||
use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody};
|
||||
use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message};
|
||||
use postgres_protocol2::message::frontend;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::codec::Framed;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
|
||||
use tokio_util::codec::{Framed, FramedParts};
|
||||
|
||||
use crate::Error;
|
||||
use crate::codec::{BackendMessage, BackendMessages, PostgresCodec};
|
||||
use crate::codec::PostgresCodec;
|
||||
use crate::config::{self, AuthKeys, Config};
|
||||
use crate::connection::{GC_THRESHOLD, INITIAL_CAPACITY};
|
||||
use crate::maybe_tls_stream::MaybeTlsStream;
|
||||
use crate::tls::TlsStream;
|
||||
|
||||
pub struct StartupStream<S, T> {
|
||||
inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
buf: BackendMessages,
|
||||
delayed_notice: Vec<NoticeResponseBody>,
|
||||
}
|
||||
|
||||
impl<S, T> Sink<Bytes> for StartupStream<S, T>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
type Error = io::Error;
|
||||
|
||||
fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
Pin::new(&mut self.inner).poll_ready(cx)
|
||||
}
|
||||
|
||||
fn start_send(mut self: Pin<&mut Self>, item: Bytes) -> io::Result<()> {
|
||||
Pin::new(&mut self.inner).start_send(item)
|
||||
}
|
||||
|
||||
fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
Pin::new(&mut self.inner).poll_flush(cx)
|
||||
}
|
||||
|
||||
fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
Pin::new(&mut self.inner).poll_close(cx)
|
||||
}
|
||||
read_buf: BytesMut,
|
||||
}
|
||||
|
||||
impl<S, T> Stream for StartupStream<S, T>
|
||||
@@ -56,78 +31,109 @@ where
|
||||
{
|
||||
type Item = io::Result<Message>;
|
||||
|
||||
fn poll_next(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<io::Result<Message>>> {
|
||||
loop {
|
||||
match self.buf.next() {
|
||||
Ok(Some(message)) => return Poll::Ready(Some(Ok(message))),
|
||||
Ok(None) => {}
|
||||
Err(e) => return Poll::Ready(Some(Err(e))),
|
||||
}
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
// We don't use `self.inner.poll_next()` as that might over-read into the read buffer.
|
||||
|
||||
match ready!(Pin::new(&mut self.inner).poll_next(cx)) {
|
||||
Some(Ok(BackendMessage::Normal { messages, .. })) => self.buf = messages,
|
||||
Some(Ok(BackendMessage::Async(message))) => return Poll::Ready(Some(Ok(message))),
|
||||
Some(Err(e)) => return Poll::Ready(Some(Err(e))),
|
||||
None => return Poll::Ready(None),
|
||||
}
|
||||
// read 1 byte tag, 4 bytes length.
|
||||
let header = ready!(self.as_mut().poll_fill_buf_exact(cx, 5)?);
|
||||
|
||||
let len = u32::from_be_bytes(header[1..5].try_into().unwrap());
|
||||
if len < 4 {
|
||||
return Poll::Ready(Some(Err(std::io::Error::other(
|
||||
"postgres message too small",
|
||||
))));
|
||||
}
|
||||
if len >= 65536 {
|
||||
return Poll::Ready(Some(Err(std::io::Error::other(
|
||||
"postgres message too large",
|
||||
))));
|
||||
}
|
||||
|
||||
// the tag is an additional byte.
|
||||
let _message = ready!(self.as_mut().poll_fill_buf_exact(cx, len as usize + 1)?);
|
||||
|
||||
// Message::parse will remove all the bytes from the buffer.
|
||||
Poll::Ready(Message::parse(&mut self.read_buf).transpose())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RawConnection<S, T> {
|
||||
pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
|
||||
pub parameters: HashMap<String, String>,
|
||||
pub delayed_notice: Vec<NoticeResponseBody>,
|
||||
pub process_id: i32,
|
||||
pub secret_key: i32,
|
||||
}
|
||||
|
||||
pub async fn connect_raw<S, T>(
|
||||
stream: MaybeTlsStream<S, T>,
|
||||
config: &Config,
|
||||
) -> Result<RawConnection<S, T>, Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsStream + Unpin,
|
||||
{
|
||||
let mut stream = StartupStream {
|
||||
inner: Framed::new(stream, PostgresCodec),
|
||||
buf: BackendMessages::empty(),
|
||||
delayed_notice: Vec::new(),
|
||||
};
|
||||
|
||||
startup(&mut stream, config).await?;
|
||||
authenticate(&mut stream, config).await?;
|
||||
let (process_id, secret_key, parameters) = read_info(&mut stream).await?;
|
||||
|
||||
Ok(RawConnection {
|
||||
stream: stream.inner,
|
||||
parameters,
|
||||
delayed_notice: stream.delayed_notice,
|
||||
process_id,
|
||||
secret_key,
|
||||
})
|
||||
}
|
||||
|
||||
async fn startup<S, T>(stream: &mut StartupStream<S, T>, config: &Config) -> Result<(), Error>
|
||||
impl<S, T> StartupStream<S, T>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
let mut buf = BytesMut::new();
|
||||
frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?;
|
||||
/// Fill the buffer until it's the exact length provided. No additional data will be read from the socket.
|
||||
///
|
||||
/// If the current buffer length is greater, nothing happens.
|
||||
fn poll_fill_buf_exact(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
len: usize,
|
||||
) -> Poll<Result<&[u8], std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
let mut stream = Pin::new(this.inner.get_mut());
|
||||
|
||||
stream.send(buf.freeze()).await.map_err(Error::io)
|
||||
let mut n = this.read_buf.len();
|
||||
while n < len {
|
||||
this.read_buf.resize(len, 0);
|
||||
|
||||
let mut buf = ReadBuf::new(&mut this.read_buf[..]);
|
||||
buf.set_filled(n);
|
||||
|
||||
if stream.as_mut().poll_read(cx, &mut buf)?.is_pending() {
|
||||
this.read_buf.truncate(n);
|
||||
return Poll::Pending;
|
||||
}
|
||||
|
||||
if buf.filled().len() == n {
|
||||
return Poll::Ready(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
"early eof",
|
||||
)));
|
||||
}
|
||||
n = buf.filled().len();
|
||||
|
||||
this.read_buf.truncate(n);
|
||||
}
|
||||
|
||||
Poll::Ready(Ok(&this.read_buf[..len]))
|
||||
}
|
||||
|
||||
/// Consumes the startup stream and returns the underlying [`Framed`] transport.
///
/// Any bytes still held in `read_buf` are installed as the framed read buffer,
/// replacing whatever buffer it held before.
pub fn into_framed(mut self) -> Framed<MaybeTlsStream<S, T>, PostgresCodec> {
    let pending = self.read_buf;
    *self.inner.read_buffer_mut() = pending;
    self.inner
}
|
||||
|
||||
pub fn new(io: MaybeTlsStream<S, T>) -> Self {
|
||||
let mut parts = FramedParts::new(io, PostgresCodec);
|
||||
parts.write_buf = BytesMut::with_capacity(INITIAL_CAPACITY);
|
||||
|
||||
let mut inner = Framed::from_parts(parts);
|
||||
|
||||
// This is the default already, but nice to be explicit.
|
||||
// We divide by two because writes will overshoot the boundary.
|
||||
// We don't want constant overshoots to cause us to constantly re-shrink the buffer.
|
||||
inner.set_backpressure_boundary(GC_THRESHOLD / 2);
|
||||
|
||||
Self {
|
||||
inner,
|
||||
read_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn authenticate<S, T>(stream: &mut StartupStream<S, T>, config: &Config) -> Result<(), Error>
|
||||
pub(crate) async fn authenticate<S, T>(
|
||||
stream: &mut StartupStream<S, T>,
|
||||
config: &Config,
|
||||
) -> Result<(), Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsStream + Unpin,
|
||||
{
|
||||
frontend::startup_message(&config.server_params, stream.inner.write_buffer_mut())
|
||||
.map_err(Error::encode)?;
|
||||
|
||||
stream.inner.flush().await.map_err(Error::io)?;
|
||||
match stream.try_next().await.map_err(Error::io)? {
|
||||
Some(Message::AuthenticationOk) => {
|
||||
can_skip_channel_binding(config)?;
|
||||
@@ -141,7 +147,8 @@ where
|
||||
.as_ref()
|
||||
.ok_or_else(|| Error::config("password missing".into()))?;
|
||||
|
||||
authenticate_password(stream, pass).await?;
|
||||
frontend::password_message(pass, stream.inner.write_buffer_mut())
|
||||
.map_err(Error::encode)?;
|
||||
}
|
||||
Some(Message::AuthenticationSasl(body)) => {
|
||||
authenticate_sasl(stream, body, config).await?;
|
||||
@@ -160,6 +167,7 @@ where
|
||||
None => return Err(Error::closed()),
|
||||
}
|
||||
|
||||
stream.inner.flush().await.map_err(Error::io)?;
|
||||
match stream.try_next().await.map_err(Error::io)? {
|
||||
Some(Message::AuthenticationOk) => Ok(()),
|
||||
Some(Message::ErrorResponse(body)) => Err(Error::db(body)),
|
||||
@@ -177,20 +185,6 @@ fn can_skip_channel_binding(config: &Config) -> Result<(), Error> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn authenticate_password<S, T>(
|
||||
stream: &mut StartupStream<S, T>,
|
||||
password: &[u8],
|
||||
) -> Result<(), Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
let mut buf = BytesMut::new();
|
||||
frontend::password_message(password, &mut buf).map_err(Error::encode)?;
|
||||
|
||||
stream.send(buf.freeze()).await.map_err(Error::io)
|
||||
}
|
||||
|
||||
async fn authenticate_sasl<S, T>(
|
||||
stream: &mut StartupStream<S, T>,
|
||||
body: AuthenticationSaslBody,
|
||||
@@ -245,10 +239,10 @@ where
|
||||
return Err(Error::config("password or auth keys missing".into()));
|
||||
};
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?;
|
||||
stream.send(buf.freeze()).await.map_err(Error::io)?;
|
||||
frontend::sasl_initial_response(mechanism, scram.message(), stream.inner.write_buffer_mut())
|
||||
.map_err(Error::encode)?;
|
||||
|
||||
stream.inner.flush().await.map_err(Error::io)?;
|
||||
let body = match stream.try_next().await.map_err(Error::io)? {
|
||||
Some(Message::AuthenticationSaslContinue(body)) => body,
|
||||
Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
|
||||
@@ -261,10 +255,10 @@ where
|
||||
.await
|
||||
.map_err(|e| Error::authentication(e.into()))?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?;
|
||||
stream.send(buf.freeze()).await.map_err(Error::io)?;
|
||||
frontend::sasl_response(scram.message(), stream.inner.write_buffer_mut())
|
||||
.map_err(Error::encode)?;
|
||||
|
||||
stream.inner.flush().await.map_err(Error::io)?;
|
||||
let body = match stream.try_next().await.map_err(Error::io)? {
|
||||
Some(Message::AuthenticationSaslFinal(body)) => body,
|
||||
Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
|
||||
@@ -278,35 +272,3 @@ where
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_info<S, T>(
|
||||
stream: &mut StartupStream<S, T>,
|
||||
) -> Result<(i32, i32, HashMap<String, String>), Error>
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
let mut process_id = 0;
|
||||
let mut secret_key = 0;
|
||||
let mut parameters = HashMap::new();
|
||||
|
||||
loop {
|
||||
match stream.try_next().await.map_err(Error::io)? {
|
||||
Some(Message::BackendKeyData(body)) => {
|
||||
process_id = body.process_id();
|
||||
secret_key = body.secret_key();
|
||||
}
|
||||
Some(Message::ParameterStatus(body)) => {
|
||||
parameters.insert(
|
||||
body.name().map_err(Error::parse)?.to_string(),
|
||||
body.value().map_err(Error::parse)?.to_string(),
|
||||
);
|
||||
}
|
||||
Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body),
|
||||
Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)),
|
||||
Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
|
||||
Some(_) => return Err(Error::unexpected_message()),
|
||||
None => return Err(Error::closed()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,27 @@ pub struct Connection<S, T> {
|
||||
state: State,
|
||||
}
|
||||
|
||||
pub const INITIAL_CAPACITY: usize = 2 * 1024;
|
||||
pub const GC_THRESHOLD: usize = 16 * 1024;
|
||||
|
||||
/// Gargabe collect the [`BytesMut`] if it has too much spare capacity.
|
||||
pub fn gc_bytesmut(buf: &mut BytesMut) {
|
||||
// We use a different mode to shrink the buf when above the threshold.
|
||||
// When above the threshold, we only re-allocate when the buf has 2x spare capacity.
|
||||
let reclaim = GC_THRESHOLD.checked_sub(buf.len()).unwrap_or(buf.len());
|
||||
|
||||
// `try_reclaim` tries to get the capacity from any shared `BytesMut`s,
|
||||
// before then comparing the length against the capacity.
|
||||
if buf.try_reclaim(reclaim) {
|
||||
let capacity = usize::max(buf.len(), INITIAL_CAPACITY);
|
||||
|
||||
// Allocate a new `BytesMut` so that we deallocate the old version.
|
||||
let mut new = BytesMut::with_capacity(capacity);
|
||||
new.extend_from_slice(buf);
|
||||
*buf = new;
|
||||
}
|
||||
}
|
||||
|
||||
pub enum Never {}
|
||||
|
||||
impl<S, T> Connection<S, T>
|
||||
@@ -86,7 +107,14 @@ where
|
||||
continue;
|
||||
}
|
||||
BackendMessage::Async(_) => continue,
|
||||
BackendMessage::Normal { messages } => messages,
|
||||
BackendMessage::Normal { messages, ready } => {
|
||||
// if we read a ReadyForQuery from postgres, let's try GC the read buffer.
|
||||
if ready {
|
||||
gc_bytesmut(self.stream.read_buffer_mut());
|
||||
}
|
||||
|
||||
messages
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -177,12 +205,7 @@ where
|
||||
// Send a terminate message to postgres
|
||||
Poll::Ready(None) => {
|
||||
trace!("poll_write: at eof, terminating");
|
||||
let mut request = BytesMut::new();
|
||||
frontend::terminate(&mut request);
|
||||
|
||||
Pin::new(&mut self.stream)
|
||||
.start_send(request.freeze())
|
||||
.map_err(Error::io)?;
|
||||
frontend::terminate(self.stream.write_buffer_mut());
|
||||
|
||||
trace!("poll_write: sent eof, closing");
|
||||
trace!("poll_write: done");
|
||||
@@ -205,6 +228,13 @@ where
|
||||
{
|
||||
Poll::Ready(()) => {
|
||||
trace!("poll_flush: flushed");
|
||||
|
||||
// Since our codec prefers to share the buffer with the `Client`,
|
||||
// if we don't release our share, then the `Client` would have to re-alloc
|
||||
// the buffer when they next use it.
|
||||
debug_assert!(self.stream.write_buffer().is_empty());
|
||||
*self.stream.write_buffer_mut() = BytesMut::new();
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
Poll::Pending => {
|
||||
|
||||
@@ -9,7 +9,7 @@ use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody};
|
||||
pub use self::sqlstate::*;
|
||||
|
||||
#[allow(clippy::unreadable_literal)]
|
||||
mod sqlstate;
|
||||
pub mod sqlstate;
|
||||
|
||||
/// The severity of a Postgres error or notice.
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
@@ -452,16 +452,16 @@ impl Error {
|
||||
Error(Box::new(ErrorInner { kind, cause }))
|
||||
}
|
||||
|
||||
pub(crate) fn closed() -> Error {
|
||||
pub fn closed() -> Error {
|
||||
Error::new(Kind::Closed, None)
|
||||
}
|
||||
|
||||
pub(crate) fn unexpected_message() -> Error {
|
||||
pub fn unexpected_message() -> Error {
|
||||
Error::new(Kind::UnexpectedMessage, None)
|
||||
}
|
||||
|
||||
#[allow(clippy::needless_pass_by_value)]
|
||||
pub(crate) fn db(error: ErrorResponseBody) -> Error {
|
||||
pub fn db(error: ErrorResponseBody) -> Error {
|
||||
match DbError::parse(&mut error.fields()) {
|
||||
Ok(e) => Error::new(Kind::Db, Some(Box::new(e))),
|
||||
Err(e) => Error::new(Kind::Parse, Some(Box::new(e))),
|
||||
@@ -493,7 +493,7 @@ impl Error {
|
||||
Error::new(Kind::Tls, Some(e))
|
||||
}
|
||||
|
||||
pub(crate) fn io(e: io::Error) -> Error {
|
||||
pub fn io(e: io::Error) -> Error {
|
||||
Error::new(Kind::Io, Some(Box::new(e)))
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ use postgres_protocol2::message::backend::ReadyForQueryBody;
|
||||
pub use crate::cancel_token::{CancelToken, RawCancelToken};
|
||||
pub use crate::client::{Client, SocketConfig};
|
||||
pub use crate::config::Config;
|
||||
pub use crate::connect_raw::RawConnection;
|
||||
pub use crate::connection::Connection;
|
||||
pub use crate::error::Error;
|
||||
pub use crate::generic_client::GenericClient;
|
||||
@@ -49,8 +48,8 @@ mod cancel_token;
|
||||
mod client;
|
||||
mod codec;
|
||||
pub mod config;
|
||||
mod connect;
|
||||
mod connect_raw;
|
||||
pub mod connect;
|
||||
pub mod connect_raw;
|
||||
mod connect_socket;
|
||||
mod connect_tls;
|
||||
mod connection;
|
||||
|
||||
@@ -301,7 +301,12 @@ pub struct PullTimelineRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub http_hosts: Vec<String>,
|
||||
pub ignore_tombstone: Option<bool>,
|
||||
/// Membership configuration to switch to after pull.
|
||||
/// It guarantees that if pull_timeline returns successfully, the timeline will
|
||||
/// not be deleted by request with an older generation.
|
||||
/// Storage controller always sets this field.
|
||||
/// None is only allowed for manual pull_timeline requests.
|
||||
pub mconf: Option<Configuration>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
hyper0.workspace = true
|
||||
opentelemetry = { workspace = true, features = ["trace"] }
|
||||
opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
|
||||
opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-blocking-client"] }
|
||||
opentelemetry-semantic-conventions.workspace = true
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -49,7 +49,7 @@ impl PerfSpan {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn enter(&self) -> PerfSpanEntered {
|
||||
pub fn enter(&self) -> PerfSpanEntered<'_> {
|
||||
if let Some(ref id) = self.inner.id() {
|
||||
self.dispatch.enter(id);
|
||||
}
|
||||
|
||||
@@ -34,13 +34,16 @@ macro_rules! critical {
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! critical_timeline {
|
||||
($tenant_shard_id:expr, $timeline_id:expr, $($arg:tt)*) => {{
|
||||
($tenant_shard_id:expr, $timeline_id:expr, $corruption_detected:expr, $($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) {
|
||||
panic!($($arg)*);
|
||||
}
|
||||
// Increment both metrics
|
||||
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
|
||||
$crate::logging::HADRON_CRITICAL_STORAGE_EVENT_COUNT_METRIC.inc(&$tenant_shard_id.to_string(), &$timeline_id.to_string());
|
||||
if let Some(c) = $corruption_detected.as_ref() {
|
||||
c.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
tracing::error!("CRITICAL: [tenant_shard_id: {}, timeline_id: {}] {}\n{backtrace}",
|
||||
$tenant_shard_id, $timeline_id, format!($($arg)*));
|
||||
|
||||
@@ -32,6 +32,9 @@ pub struct PageserverFeedback {
|
||||
pub replytime: SystemTime,
|
||||
/// Used to track feedbacks from different shards. Always zero for unsharded tenants.
|
||||
pub shard_number: u32,
|
||||
/// If true, the pageserver has detected corruption and the safekeeper and postgres
|
||||
/// should stop sending WAL.
|
||||
pub corruption_detected: bool,
|
||||
}
|
||||
|
||||
impl PageserverFeedback {
|
||||
@@ -43,6 +46,7 @@ impl PageserverFeedback {
|
||||
disk_consistent_lsn: Lsn::INVALID,
|
||||
replytime: *PG_EPOCH,
|
||||
shard_number: 0,
|
||||
corruption_detected: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +105,13 @@ impl PageserverFeedback {
|
||||
buf.put_u32(self.shard_number);
|
||||
}
|
||||
|
||||
if self.corruption_detected {
|
||||
nkeys += 1;
|
||||
buf.put_slice(b"corruption_detected\0");
|
||||
buf.put_i32(1);
|
||||
buf.put_u8(1);
|
||||
}
|
||||
|
||||
buf[buf_ptr] = nkeys;
|
||||
}
|
||||
|
||||
@@ -147,6 +158,11 @@ impl PageserverFeedback {
|
||||
assert_eq!(len, 4);
|
||||
rf.shard_number = buf.get_u32();
|
||||
}
|
||||
b"corruption_detected" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 1);
|
||||
rf.corruption_detected = buf.get_u8() != 0;
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
@@ -206,6 +222,26 @@ mod tests {
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
// Round-trip test: the databricks-specific fields added to the PageserverFeedback
// message must survive serialize + parse, alongside the existing upstream fields.
#[test]
fn test_replication_feedback_databricks_fields() {
    let mut feedback = PageserverFeedback::empty();
    feedback.current_timeline_size = 12345678;
    feedback.last_received_lsn = Lsn(23456789);
    feedback.disk_consistent_lsn = Lsn(34567890);
    feedback.remote_consistent_lsn = Lsn(45678901);
    feedback.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
    feedback.shard_number = 1;
    feedback.corruption_detected = true;

    let mut wire = BytesMut::new();
    feedback.serialize(&mut wire);

    let decoded = PageserverFeedback::parse(wire.freeze());
    assert_eq!(feedback, decoded);
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
|
||||
@@ -59,6 +59,10 @@ impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const MIN: Self = Self(0);
|
||||
|
||||
pub fn unsharded() -> Self {
|
||||
ShardCount(0)
|
||||
}
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as [`TenantShardId::unsharded`].
|
||||
|
||||
@@ -426,12 +426,15 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
||||
remote_consistent_lsn: 0,
|
||||
replytime: 0,
|
||||
shard_number: 0,
|
||||
corruption_detected: false,
|
||||
};
|
||||
|
||||
let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
|
||||
effective_max_wal_bytes_per_second: crate::bindings::pg_atomic_uint32 { value: 0 },
|
||||
should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
|
||||
sent_bytes: 0,
|
||||
last_recorded_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
batch_start_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
batch_end_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
|
||||
};
|
||||
|
||||
crate::bindings::WalproposerShmemState {
|
||||
|
||||
@@ -14,9 +14,9 @@ use utils::logging::warn_slow;
|
||||
|
||||
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
|
||||
use crate::retry::Retry;
|
||||
use crate::split::GetPageSplitter;
|
||||
use compute_api::spec::PageserverProtocol;
|
||||
use pageserver_page_api as page_api;
|
||||
use pageserver_page_api::GetPageSplitter;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};
|
||||
|
||||
@@ -230,16 +230,14 @@ impl PageserverClient {
|
||||
) -> tonic::Result<page_api::GetPageResponse> {
|
||||
// Fast path: request is for a single shard.
|
||||
if let Some(shard_id) =
|
||||
GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))?
|
||||
GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)?
|
||||
{
|
||||
return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
|
||||
}
|
||||
|
||||
// Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
|
||||
// reassemble the responses.
|
||||
let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))?;
|
||||
let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size)?;
|
||||
|
||||
let mut shard_requests = FuturesUnordered::new();
|
||||
for (shard_id, shard_req) in splitter.drain_requests() {
|
||||
@@ -249,14 +247,10 @@ impl PageserverClient {
|
||||
}
|
||||
|
||||
while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter
|
||||
.add_response(shard_id, shard_response)
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))?;
|
||||
splitter.add_response(shard_id, shard_response)?;
|
||||
}
|
||||
|
||||
splitter
|
||||
.get_response()
|
||||
.map_err(|err| tonic::Status::internal(err.to_string()))
|
||||
Ok(splitter.collect_response()?)
|
||||
}
|
||||
|
||||
/// Fetches pages on the given shard. Does not retry internally.
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
mod client;
|
||||
mod pool;
|
||||
mod retry;
|
||||
mod split;
|
||||
|
||||
pub use client::{PageserverClient, ShardSpec};
|
||||
|
||||
@@ -19,7 +19,9 @@ pub mod proto {
|
||||
}
|
||||
|
||||
mod client;
|
||||
pub use client::Client;
|
||||
mod model;
|
||||
mod split;
|
||||
|
||||
pub use client::Client;
|
||||
pub use model::*;
|
||||
pub use split::{GetPageSplitter, SplitError};
|
||||
|
||||
@@ -1,20 +1,19 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::anyhow;
|
||||
use bytes::Bytes;
|
||||
|
||||
use crate::model::*;
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::shard::key_to_shard_number;
|
||||
use pageserver_page_api as page_api;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardStripeSize};
|
||||
|
||||
/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
|
||||
/// TODO: add tests for this.
|
||||
pub struct GetPageSplitter {
|
||||
/// Split requests by shard index.
|
||||
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
|
||||
requests: HashMap<ShardIndex, GetPageRequest>,
|
||||
/// The response being assembled. Preallocated with empty pages, to be filled in.
|
||||
response: page_api::GetPageResponse,
|
||||
response: GetPageResponse,
|
||||
/// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
|
||||
/// to assemble the response pages in the same order as the original request.
|
||||
block_shards: Vec<ShardIndex>,
|
||||
@@ -24,22 +23,22 @@ impl GetPageSplitter {
|
||||
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
|
||||
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
|
||||
pub fn for_single_shard(
|
||||
req: &page_api::GetPageRequest,
|
||||
req: &GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<Option<ShardIndex>> {
|
||||
) -> Result<Option<ShardIndex>, SplitError> {
|
||||
// Fast path: unsharded tenant.
|
||||
if count.is_unsharded() {
|
||||
return Ok(Some(ShardIndex::unsharded()));
|
||||
}
|
||||
|
||||
let Some(stripe_size) = stripe_size else {
|
||||
return Err(anyhow!("stripe size must be given for sharded tenants"));
|
||||
return Err("stripe size must be given for sharded tenants".into());
|
||||
};
|
||||
|
||||
// Find the first page's shard, for comparison.
|
||||
let Some(&first_page) = req.block_numbers.first() else {
|
||||
return Err(anyhow!("no block numbers in request"));
|
||||
return Err("no block numbers in request".into());
|
||||
};
|
||||
let key = rel_block_to_key(req.rel, first_page);
|
||||
let shard_number = key_to_shard_number(count, stripe_size, &key);
|
||||
@@ -57,10 +56,10 @@ impl GetPageSplitter {
|
||||
|
||||
/// Splits the given request.
|
||||
pub fn split(
|
||||
req: page_api::GetPageRequest,
|
||||
req: GetPageRequest,
|
||||
count: ShardCount,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> Result<Self, SplitError> {
|
||||
// The caller should make sure we don't split requests unnecessarily.
|
||||
debug_assert!(
|
||||
Self::for_single_shard(&req, count, stripe_size)?.is_none(),
|
||||
@@ -68,10 +67,10 @@ impl GetPageSplitter {
|
||||
);
|
||||
|
||||
if count.is_unsharded() {
|
||||
return Err(anyhow!("unsharded tenant, no point in splitting request"));
|
||||
return Err("unsharded tenant, no point in splitting request".into());
|
||||
}
|
||||
let Some(stripe_size) = stripe_size else {
|
||||
return Err(anyhow!("stripe size must be given for sharded tenants"));
|
||||
return Err("stripe size must be given for sharded tenants".into());
|
||||
};
|
||||
|
||||
// Split the requests by shard index.
|
||||
@@ -84,7 +83,7 @@ impl GetPageSplitter {
|
||||
|
||||
requests
|
||||
.entry(shard_id)
|
||||
.or_insert_with(|| page_api::GetPageRequest {
|
||||
.or_insert_with(|| GetPageRequest {
|
||||
request_id: req.request_id,
|
||||
request_class: req.request_class,
|
||||
rel: req.rel,
|
||||
@@ -98,16 +97,16 @@ impl GetPageSplitter {
|
||||
|
||||
// Construct a response to be populated by shard responses. Preallocate empty page slots
|
||||
// with the expected block numbers.
|
||||
let response = page_api::GetPageResponse {
|
||||
let response = GetPageResponse {
|
||||
request_id: req.request_id,
|
||||
status_code: page_api::GetPageStatusCode::Ok,
|
||||
status_code: GetPageStatusCode::Ok,
|
||||
reason: None,
|
||||
rel: req.rel,
|
||||
pages: req
|
||||
.block_numbers
|
||||
.into_iter()
|
||||
.map(|block_number| {
|
||||
page_api::Page {
|
||||
Page {
|
||||
block_number,
|
||||
image: Bytes::new(), // empty page slot to be filled in
|
||||
}
|
||||
@@ -123,43 +122,38 @@ impl GetPageSplitter {
|
||||
}
|
||||
|
||||
/// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
|
||||
pub fn drain_requests(
|
||||
&mut self,
|
||||
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
|
||||
pub fn drain_requests(&mut self) -> impl Iterator<Item = (ShardIndex, GetPageRequest)> {
|
||||
self.requests.drain()
|
||||
}
|
||||
|
||||
/// Adds a response from the given shard. The response must match the request ID and have an OK
|
||||
/// status code. A response must not already exist for the given shard ID.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn add_response(
|
||||
&mut self,
|
||||
shard_id: ShardIndex,
|
||||
response: page_api::GetPageResponse,
|
||||
) -> anyhow::Result<()> {
|
||||
response: GetPageResponse,
|
||||
) -> Result<(), SplitError> {
|
||||
// The caller should already have converted status codes into tonic::Status.
|
||||
if response.status_code != page_api::GetPageStatusCode::Ok {
|
||||
return Err(anyhow!(
|
||||
if response.status_code != GetPageStatusCode::Ok {
|
||||
return Err(SplitError(format!(
|
||||
"unexpected non-OK response for shard {shard_id}: {} {}",
|
||||
response.status_code,
|
||||
response.reason.unwrap_or_default()
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
if response.request_id != self.response.request_id {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"response ID mismatch for shard {shard_id}: expected {}, got {}",
|
||||
self.response.request_id,
|
||||
response.request_id
|
||||
));
|
||||
self.response.request_id, response.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
if response.request_id != self.response.request_id {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"response ID mismatch for shard {shard_id}: expected {}, got {}",
|
||||
self.response.request_id,
|
||||
response.request_id
|
||||
));
|
||||
self.response.request_id, response.request_id
|
||||
)));
|
||||
}
|
||||
|
||||
// Place the shard response pages into the assembled response, in request order.
|
||||
@@ -171,26 +165,27 @@ impl GetPageSplitter {
|
||||
}
|
||||
|
||||
let Some(slot) = self.response.pages.get_mut(i) else {
|
||||
return Err(anyhow!("no block_shards slot {i} for shard {shard_id}"));
|
||||
return Err(SplitError(format!(
|
||||
"no block_shards slot {i} for shard {shard_id}"
|
||||
)));
|
||||
};
|
||||
let Some(page) = pages.next() else {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"missing page {} in shard {shard_id} response",
|
||||
slot.block_number
|
||||
));
|
||||
)));
|
||||
};
|
||||
if page.block_number != slot.block_number {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"shard {shard_id} returned wrong page at index {i}, expected {} got {}",
|
||||
slot.block_number,
|
||||
page.block_number
|
||||
));
|
||||
slot.block_number, page.block_number
|
||||
)));
|
||||
}
|
||||
if !slot.image.is_empty() {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"shard {shard_id} returned duplicate page {} at index {i}",
|
||||
slot.block_number
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
*slot = page;
|
||||
@@ -198,32 +193,54 @@ impl GetPageSplitter {
|
||||
|
||||
// Make sure we've consumed all pages from the shard response.
|
||||
if let Some(extra_page) = pages.next() {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"shard {shard_id} returned extra page: {}",
|
||||
extra_page.block_number
|
||||
));
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fetches the final, assembled response.
|
||||
#[allow(clippy::result_large_err)]
|
||||
pub fn get_response(self) -> anyhow::Result<page_api::GetPageResponse> {
|
||||
/// Collects the final, assembled response.
|
||||
pub fn collect_response(self) -> Result<GetPageResponse, SplitError> {
|
||||
// Check that the response is complete.
|
||||
for (i, page) in self.response.pages.iter().enumerate() {
|
||||
if page.image.is_empty() {
|
||||
return Err(anyhow!(
|
||||
return Err(SplitError(format!(
|
||||
"missing page {} for shard {}",
|
||||
page.block_number,
|
||||
self.block_shards
|
||||
.get(i)
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| "?".to_string())
|
||||
));
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(self.response)
|
||||
}
|
||||
}
|
||||
|
||||
/// A GetPageSplitter error.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("{0}")]
|
||||
pub struct SplitError(String);
|
||||
|
||||
impl From<&str> for SplitError {
|
||||
fn from(err: &str) -> Self {
|
||||
SplitError(err.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<String> for SplitError {
|
||||
fn from(err: String) -> Self {
|
||||
SplitError(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SplitError> for tonic::Status {
|
||||
fn from(err: SplitError) -> Self {
|
||||
tonic::Status::internal(err.0)
|
||||
}
|
||||
}
|
||||
@@ -715,7 +715,7 @@ fn start_pageserver(
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue.new_client(),
|
||||
secondary_controller,
|
||||
feature_resolver,
|
||||
feature_resolver.clone(),
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
@@ -841,14 +841,14 @@ fn start_pageserver(
|
||||
} else {
|
||||
None
|
||||
},
|
||||
feature_resolver.clone(),
|
||||
);
|
||||
|
||||
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for
|
||||
// each stream/request.
|
||||
// Spawn a Pageserver gRPC server task. It will spawn separate tasks for each request/stream.
|
||||
// It uses a separate compute request Tokio runtime (COMPUTE_REQUEST_RUNTIME).
|
||||
//
|
||||
// TODO: this uses a separate Tokio runtime for the page service. If we want
|
||||
// other gRPC services, they will need their own port and runtime. Is this
|
||||
// necessary?
|
||||
// NB: this port is exposed to computes. It should only provide services that we're okay with
|
||||
// computes accessing. Internal services should use a separate port.
|
||||
let mut page_service_grpc = None;
|
||||
if let Some(grpc_listener) = grpc_listener {
|
||||
page_service_grpc = Some(GrpcPageServiceHandler::spawn(
|
||||
|
||||
@@ -484,6 +484,8 @@ async fn build_timeline_info_common(
|
||||
*timeline.get_applied_gc_cutoff_lsn(),
|
||||
);
|
||||
|
||||
let (rel_size_migration, rel_size_migrated_at) = timeline.get_rel_size_v2_status();
|
||||
|
||||
let info = TimelineInfo {
|
||||
tenant_id: timeline.tenant_shard_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
@@ -515,7 +517,8 @@ async fn build_timeline_info_common(
|
||||
|
||||
state,
|
||||
is_archived: Some(is_archived),
|
||||
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
|
||||
rel_size_migration: Some(rel_size_migration),
|
||||
rel_size_migrated_at,
|
||||
is_invisible: Some(is_invisible),
|
||||
|
||||
walreceiver_status,
|
||||
@@ -930,9 +933,16 @@ async fn timeline_patch_index_part_handler(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
if request_data.rel_size_migration.is_none() && request_data.rel_size_migrated_at.is_some()
|
||||
{
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"updating rel_size_migrated_at without rel_size_migration is not allowed"
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(rel_size_migration) = request_data.rel_size_migration {
|
||||
timeline
|
||||
.update_rel_size_v2_status(rel_size_migration)
|
||||
.update_rel_size_v2_status(rel_size_migration, request_data.rel_size_migrated_at)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
@@ -1995,6 +2005,10 @@ async fn put_tenant_location_config_handler(
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
|
||||
fail::fail_point!("put-location-conf-handler", |_| {
|
||||
Err(ApiError::ResourceUnavailable("failpoint".into()))
|
||||
});
|
||||
|
||||
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
||||
// its local disk content and drops it from memory.
|
||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||
|
||||
@@ -57,7 +57,7 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
|
||||
// TODO this should be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
|
||||
// Then fishing out pg_control would be unnecessary
|
||||
let mut modification = tline.begin_modification(pgdata_lsn);
|
||||
let mut modification = tline.begin_modification_for_import(pgdata_lsn);
|
||||
modification.init_empty()?;
|
||||
|
||||
// Import all but pg_wal
|
||||
@@ -309,7 +309,7 @@ async fn import_wal(
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut modification = tline.begin_modification_for_import(last_lsn);
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
@@ -357,7 +357,7 @@ pub async fn import_basebackup_from_tar(
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
info!("importing base at {base_lsn}");
|
||||
let mut modification = tline.begin_modification(base_lsn);
|
||||
let mut modification = tline.begin_modification_for_import(base_lsn);
|
||||
modification.init_empty()?;
|
||||
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
@@ -457,7 +457,7 @@ pub async fn import_wal_from_tar(
|
||||
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut modification = tline.begin_modification_for_import(last_lsn);
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
|
||||
@@ -16,7 +16,8 @@ use anyhow::{Context as _, bail};
|
||||
use bytes::{Buf as _, BufMut as _, BytesMut};
|
||||
use chrono::Utc;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, Stream};
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, Stream, StreamExt as _};
|
||||
use itertools::Itertools;
|
||||
use jsonwebtoken::TokenData;
|
||||
use once_cell::sync::OnceCell;
|
||||
@@ -35,8 +36,8 @@ use pageserver_api::pagestream_api::{
|
||||
};
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_page_api as page_api;
|
||||
use pageserver_page_api::proto;
|
||||
use pageserver_page_api::{self as page_api, GetPageSplitter};
|
||||
use postgres_backend::{
|
||||
AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
|
||||
};
|
||||
@@ -68,6 +69,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::feature_resolver::FeatureResolver;
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
|
||||
@@ -139,6 +141,7 @@ pub fn spawn(
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
feature_resolver: FeatureResolver,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
@@ -160,6 +163,7 @@ pub fn spawn(
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
feature_resolver,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
)
|
||||
@@ -218,6 +222,7 @@ pub async fn libpq_listener_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
feature_resolver: FeatureResolver,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
@@ -261,6 +266,7 @@ pub async fn libpq_listener_main(
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
feature_resolver.clone(),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
@@ -303,6 +309,7 @@ async fn page_service_conn_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
feature_resolver: FeatureResolver,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -370,6 +377,7 @@ async fn page_service_conn_main(
|
||||
perf_span_fields,
|
||||
connection_ctx,
|
||||
cancel.clone(),
|
||||
feature_resolver.clone(),
|
||||
gate_guard,
|
||||
);
|
||||
let pgbackend =
|
||||
@@ -421,6 +429,8 @@ struct PageServerHandler {
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
|
||||
feature_resolver: FeatureResolver,
|
||||
|
||||
gate_guard: GateGuard,
|
||||
}
|
||||
|
||||
@@ -457,13 +467,6 @@ impl TimelineHandles {
|
||||
self.handles
|
||||
.get(timeline_id, shard_selector, &self.wrapper)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
timeline::handle::GetError::TenantManager(e) => e,
|
||||
timeline::handle::GetError::PerTimelineStateShutDown => {
|
||||
trace!("per-timeline state shut down");
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn tenant_id(&self) -> Option<TenantId> {
|
||||
@@ -479,11 +482,9 @@ pub(crate) struct TenantManagerWrapper {
|
||||
tenant_id: once_cell::sync::OnceCell<TenantId>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TenantManagerTypes;
|
||||
|
||||
impl timeline::handle::Types for TenantManagerTypes {
|
||||
type TenantManagerError = GetActiveTimelineError;
|
||||
type TenantManager = TenantManagerWrapper;
|
||||
type Timeline = TenantManagerCacheItem;
|
||||
}
|
||||
@@ -535,6 +536,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
|
||||
match resolved {
|
||||
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
|
||||
ShardResolveResult::NotFound => {
|
||||
MISROUTED_PAGESTREAM_REQUESTS.inc();
|
||||
return Err(GetActiveTimelineError::Tenant(
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(*tenant_id)),
|
||||
));
|
||||
@@ -586,6 +588,15 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether to hold the applied GC cutoff guard when processing GetPage requests.
|
||||
/// This is determined once at the start of pagestream subprotocol handling based on
|
||||
/// feature flags, configuration, and test conditions.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum HoldAppliedGcCutoffGuard {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum PageStreamError {
|
||||
/// We encountered an error that should prompt the client to reconnect:
|
||||
@@ -729,6 +740,7 @@ enum BatchedFeMessage {
|
||||
GetPage {
|
||||
span: Span,
|
||||
shard: WeakHandle<TenantManagerTypes>,
|
||||
applied_gc_cutoff_guard: Option<RcuReadGuard<Lsn>>,
|
||||
pages: SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
batch_break_reason: GetPageBatchBreakReason,
|
||||
},
|
||||
@@ -908,6 +920,7 @@ impl PageServerHandler {
|
||||
perf_span_fields: ConnectionPerfSpanFields,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
feature_resolver: FeatureResolver,
|
||||
gate_guard: GateGuard,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
@@ -919,6 +932,7 @@ impl PageServerHandler {
|
||||
cancel,
|
||||
pipelining_config,
|
||||
get_vectored_concurrent_io,
|
||||
feature_resolver,
|
||||
gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -958,6 +972,7 @@ impl PageServerHandler {
|
||||
ctx: &RequestContext,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
parent_span: Span,
|
||||
hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
|
||||
) -> Result<Option<BatchedFeMessage>, QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
|
||||
@@ -1195,19 +1210,27 @@ impl PageServerHandler {
|
||||
})
|
||||
.await?;
|
||||
|
||||
let applied_gc_cutoff_guard = shard.get_applied_gc_cutoff_lsn(); // hold guard
|
||||
// We're holding the Handle
|
||||
let effective_lsn = match Self::effective_request_lsn(
|
||||
&shard,
|
||||
shard.get_last_record_lsn(),
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
&shard.get_applied_gc_cutoff_lsn(),
|
||||
&applied_gc_cutoff_guard,
|
||||
) {
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
return respond_error!(span, e);
|
||||
}
|
||||
};
|
||||
let applied_gc_cutoff_guard = match hold_gc_cutoff_guard {
|
||||
HoldAppliedGcCutoffGuard::Yes => Some(applied_gc_cutoff_guard),
|
||||
HoldAppliedGcCutoffGuard::No => {
|
||||
drop(applied_gc_cutoff_guard);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let batch_wait_ctx = if ctx.has_perf_span() {
|
||||
Some(
|
||||
@@ -1228,6 +1251,7 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
applied_gc_cutoff_guard,
|
||||
pages: smallvec![BatchedGetPageRequest {
|
||||
req,
|
||||
timer,
|
||||
@@ -1328,13 +1352,28 @@ impl PageServerHandler {
|
||||
match (eligible_batch, this_msg) {
|
||||
(
|
||||
BatchedFeMessage::GetPage {
|
||||
pages: accum_pages, ..
|
||||
pages: accum_pages,
|
||||
applied_gc_cutoff_guard: accum_applied_gc_cutoff_guard,
|
||||
..
|
||||
},
|
||||
BatchedFeMessage::GetPage {
|
||||
pages: this_pages, ..
|
||||
pages: this_pages,
|
||||
applied_gc_cutoff_guard: this_applied_gc_cutoff_guard,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
accum_pages.extend(this_pages);
|
||||
// the minimum of the two guards will keep data for both alive
|
||||
match (&accum_applied_gc_cutoff_guard, this_applied_gc_cutoff_guard) {
|
||||
(None, None) => (),
|
||||
(None, Some(this)) => *accum_applied_gc_cutoff_guard = Some(this),
|
||||
(Some(_), None) => (),
|
||||
(Some(accum), Some(this)) => {
|
||||
if **accum > *this {
|
||||
*accum_applied_gc_cutoff_guard = Some(this);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -1649,6 +1688,7 @@ impl PageServerHandler {
|
||||
BatchedFeMessage::GetPage {
|
||||
span,
|
||||
shard,
|
||||
applied_gc_cutoff_guard,
|
||||
pages,
|
||||
batch_break_reason,
|
||||
} => {
|
||||
@@ -1668,6 +1708,7 @@ impl PageServerHandler {
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
drop(applied_gc_cutoff_guard);
|
||||
res
|
||||
},
|
||||
span,
|
||||
@@ -1749,7 +1790,7 @@ impl PageServerHandler {
|
||||
/// Coding discipline within this function: all interaction with the `pgb` connection
|
||||
/// needs to be sensitive to connection shutdown, currently signalled via [`Self::cancel`].
|
||||
/// This is so that we can shutdown page_service quickly.
|
||||
#[instrument(skip_all)]
|
||||
#[instrument(skip_all, fields(hold_gc_cutoff_guard))]
|
||||
async fn handle_pagerequests<IO>(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
@@ -1795,6 +1836,30 @@ impl PageServerHandler {
|
||||
.take()
|
||||
.expect("implementation error: timeline_handles should not be locked");
|
||||
|
||||
// Evaluate the expensive feature resolver check once per pagestream subprotocol handling
|
||||
// instead of once per GetPage request. This is shared between pipelined and serial paths.
|
||||
let hold_gc_cutoff_guard = if cfg!(test) || cfg!(feature = "testing") {
|
||||
HoldAppliedGcCutoffGuard::Yes
|
||||
} else {
|
||||
// Use the global feature resolver with the tenant ID directly, avoiding the need
|
||||
// to get a timeline/shard which might not be available on this pageserver node.
|
||||
let empty_properties = std::collections::HashMap::new();
|
||||
match self.feature_resolver.evaluate_boolean(
|
||||
"page-service-getpage-hold-applied-gc-cutoff-guard",
|
||||
tenant_id,
|
||||
&empty_properties,
|
||||
) {
|
||||
Ok(()) => HoldAppliedGcCutoffGuard::Yes,
|
||||
Err(_) => HoldAppliedGcCutoffGuard::No,
|
||||
}
|
||||
};
|
||||
// record it in the span of handle_pagerequests so that both the request_span
|
||||
// and the pipeline implementation spans contain the field.
|
||||
Span::current().record(
|
||||
"hold_gc_cutoff_guard",
|
||||
tracing::field::debug(&hold_gc_cutoff_guard),
|
||||
);
|
||||
|
||||
let request_span = info_span!("request");
|
||||
let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
|
||||
PageServicePipeliningConfig::Pipelined(pipelining_config) => {
|
||||
@@ -1808,6 +1873,7 @@ impl PageServerHandler {
|
||||
pipelining_config,
|
||||
protocol_version,
|
||||
io_concurrency,
|
||||
hold_gc_cutoff_guard,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1822,6 +1888,7 @@ impl PageServerHandler {
|
||||
request_span,
|
||||
protocol_version,
|
||||
io_concurrency,
|
||||
hold_gc_cutoff_guard,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1850,6 +1917,7 @@ impl PageServerHandler {
|
||||
request_span: Span,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
io_concurrency: IoConcurrency,
|
||||
hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
@@ -1871,6 +1939,7 @@ impl PageServerHandler {
|
||||
ctx,
|
||||
protocol_version,
|
||||
request_span.clone(),
|
||||
hold_gc_cutoff_guard,
|
||||
)
|
||||
.await;
|
||||
let msg = match msg {
|
||||
@@ -1918,6 +1987,7 @@ impl PageServerHandler {
|
||||
pipelining_config: PageServicePipeliningConfigPipelined,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
io_concurrency: IoConcurrency,
|
||||
hold_gc_cutoff_guard: HoldAppliedGcCutoffGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
@@ -2021,6 +2091,7 @@ impl PageServerHandler {
|
||||
&ctx,
|
||||
protocol_version,
|
||||
request_span.clone(),
|
||||
hold_gc_cutoff_guard,
|
||||
)
|
||||
.await;
|
||||
let Some(read_res) = read_res.transpose() else {
|
||||
@@ -2067,6 +2138,7 @@ impl PageServerHandler {
|
||||
pages,
|
||||
span: _,
|
||||
shard: _,
|
||||
applied_gc_cutoff_guard: _,
|
||||
batch_break_reason: _,
|
||||
} = &mut batch
|
||||
{
|
||||
@@ -3352,18 +3424,6 @@ impl GrpcPageServiceHandler {
|
||||
Ok(CancellableTask { task, cancel })
|
||||
}
|
||||
|
||||
/// Errors if the request is executed on a non-zero shard. Only shard 0 has a complete view of
|
||||
/// relations and their sizes, as well as SLRU segments and similar data.
|
||||
#[allow(clippy::result_large_err)]
|
||||
fn ensure_shard_zero(timeline: &Handle<TenantManagerTypes>) -> Result<(), tonic::Status> {
|
||||
match timeline.get_shard_index().shard_number.0 {
|
||||
0 => Ok(()),
|
||||
shard => Err(tonic::Status::invalid_argument(format!(
|
||||
"request must execute on shard zero (is shard {shard})",
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates a PagestreamRequest header from a ReadLsn and request ID.
|
||||
fn make_hdr(
|
||||
read_lsn: page_api::ReadLsn,
|
||||
@@ -3378,30 +3438,72 @@ impl GrpcPageServiceHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquires a timeline handle for the given request.
|
||||
/// Acquires a timeline handle for the given request. The shard index must match a local shard.
|
||||
///
|
||||
/// TODO: during shard splits, the compute may still be sending requests to the parent shard
|
||||
/// until the entire split is committed and the compute is notified. Consider installing a
|
||||
/// temporary shard router from the parent to the children while the split is in progress.
|
||||
///
|
||||
/// TODO: consider moving this to a middleware layer; all requests need it. Needs to manage
|
||||
/// the TimelineHandles lifecycle.
|
||||
///
|
||||
/// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to avoid
|
||||
/// the unnecessary overhead.
|
||||
/// NB: this will fail during shard splits, see comment on [`Self::maybe_split_get_page`].
|
||||
async fn get_request_timeline(
|
||||
&self,
|
||||
req: &tonic::Request<impl Any>,
|
||||
) -> Result<Handle<TenantManagerTypes>, GetActiveTimelineError> {
|
||||
let ttid = *extract::<TenantTimelineId>(req);
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = *extract::<TenantTimelineId>(req);
|
||||
let shard_index = *extract::<ShardIndex>(req);
|
||||
let shard_selector = ShardSelector::Known(shard_index);
|
||||
|
||||
// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
|
||||
// avoid the unnecessary overhead.
|
||||
TimelineHandles::new(self.tenant_manager.clone())
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Acquires a timeline handle for the given request, which must be for shard zero. Most
|
||||
/// metadata requests are only valid on shard zero.
|
||||
///
|
||||
/// NB: during an ongoing shard split, the compute will keep talking to the parent shard until
|
||||
/// the split is committed, but the parent shard may have been removed in the meanwhile. In that
|
||||
/// case, we reroute the request to the new child shard. See [`Self::maybe_split_get_page`].
|
||||
///
|
||||
/// TODO: revamp the split protocol to avoid this child routing.
|
||||
async fn get_request_timeline_shard_zero(
|
||||
&self,
|
||||
req: &tonic::Request<impl Any>,
|
||||
) -> Result<Handle<TenantManagerTypes>, tonic::Status> {
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = *extract::<TenantTimelineId>(req);
|
||||
let shard_index = *extract::<ShardIndex>(req);
|
||||
|
||||
if shard_index.shard_number.0 != 0 {
|
||||
return Err(tonic::Status::invalid_argument(format!(
|
||||
"request only valid on shard zero (requested shard {shard_index})",
|
||||
)));
|
||||
}
|
||||
|
||||
// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
|
||||
// avoid the unnecessary overhead.
|
||||
let mut handles = TimelineHandles::new(self.tenant_manager.clone());
|
||||
match handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await
|
||||
{
|
||||
Ok(timeline) => Ok(timeline),
|
||||
Err(err) => {
|
||||
// We may be in the middle of a shard split. Try to find a child shard 0.
|
||||
if let Ok(timeline) = handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await
|
||||
&& timeline.get_shard_index().shard_count > shard_index.shard_count
|
||||
{
|
||||
return Ok(timeline);
|
||||
}
|
||||
Err(err.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start.
|
||||
/// Only errors if the timeline is shutting down.
|
||||
///
|
||||
@@ -3428,32 +3530,37 @@ impl GrpcPageServiceHandler {
|
||||
/// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
|
||||
/// GetPageResponse with an appropriate status code to avoid terminating the stream.
|
||||
///
|
||||
/// TODO: verify that the requested pages belong to this shard.
|
||||
///
|
||||
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
|
||||
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
|
||||
/// split them up in the client or server.
|
||||
#[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
|
||||
#[instrument(skip_all, fields(
|
||||
req_id = %req.request_id,
|
||||
rel = %req.rel,
|
||||
blkno = %req.block_numbers[0],
|
||||
blks = %req.block_numbers.len(),
|
||||
lsn = %req.read_lsn,
|
||||
))]
|
||||
async fn get_page(
|
||||
ctx: &RequestContext,
|
||||
timeline: &WeakHandle<TenantManagerTypes>,
|
||||
req: proto::GetPageRequest,
|
||||
timeline: Handle<TenantManagerTypes>,
|
||||
req: page_api::GetPageRequest,
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<proto::GetPageResponse, tonic::Status> {
|
||||
let received_at = Instant::now();
|
||||
let timeline = timeline.upgrade()?;
|
||||
received_at: Instant,
|
||||
) -> Result<page_api::GetPageResponse, tonic::Status> {
|
||||
let ctx = ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
let req = page_api::GetPageRequest::try_from(req)?;
|
||||
|
||||
span_record!(
|
||||
req_id = %req.request_id,
|
||||
rel = %req.rel,
|
||||
blkno = %req.block_numbers[0],
|
||||
blks = %req.block_numbers.len(),
|
||||
lsn = %req.read_lsn,
|
||||
);
|
||||
for &blkno in &req.block_numbers {
|
||||
let shard = timeline.get_shard_identity();
|
||||
let key = rel_block_to_key(req.rel, blkno);
|
||||
if !shard.is_key_local(&key) {
|
||||
return Err(tonic::Status::invalid_argument(format!(
|
||||
"block {blkno} of relation {} requested on wrong shard {} (is on {})",
|
||||
req.rel,
|
||||
timeline.get_shard_index(),
|
||||
ShardIndex::new(shard.get_shard_number(&key), shard.count),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
|
||||
let effective_lsn = PageServerHandler::effective_request_lsn(
|
||||
@@ -3529,7 +3636,89 @@ impl GrpcPageServiceHandler {
|
||||
};
|
||||
}
|
||||
|
||||
Ok(resp.into())
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Processes a GetPage request when there is a potential shard split in progress. We have to
|
||||
/// reroute the request to any local child shards, and split batch requests that straddle
|
||||
/// multiple child shards.
|
||||
///
|
||||
/// Parent shards are split and removed incrementally (there may be many parent shards when
|
||||
/// splitting an already-sharded tenant), but the compute is only notified once the overall
|
||||
/// split commits, which can take several minutes. In the meanwhile, the compute will be sending
|
||||
/// requests to the parent shards.
|
||||
///
|
||||
/// TODO: add test infrastructure to provoke this situation frequently and for long periods of
|
||||
/// time, to properly exercise it.
|
||||
///
|
||||
/// TODO: revamp the split protocol to avoid this, e.g.:
|
||||
/// * Keep the parent shard until the split commits and the compute is notified.
|
||||
/// * Notify the compute about each subsplit.
|
||||
/// * Return an error that updates the compute's shard map.
|
||||
#[instrument(skip_all)]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn maybe_split_get_page(
|
||||
ctx: &RequestContext,
|
||||
handles: &mut TimelineHandles,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
parent: ShardIndex,
|
||||
req: page_api::GetPageRequest,
|
||||
io_concurrency: IoConcurrency,
|
||||
received_at: Instant,
|
||||
) -> Result<page_api::GetPageResponse, tonic::Status> {
|
||||
// Check the first page to see if we have any child shards at all. Otherwise, the compute is
|
||||
// just talking to the wrong Pageserver. If the parent has been split, the shard now owning
|
||||
// the page must have a higher shard count.
|
||||
let timeline = handles
|
||||
.get(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
ShardSelector::Page(rel_block_to_key(req.rel, req.block_numbers[0])),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let shard_id = timeline.get_shard_identity();
|
||||
if shard_id.count <= parent.shard_count {
|
||||
return Err(HandleUpgradeError::ShutDown.into()); // emulate original error
|
||||
}
|
||||
|
||||
// Fast path: the request fits in a single shard.
|
||||
if let Some(shard_index) =
|
||||
GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size))?
|
||||
{
|
||||
// We got the shard ID from the first page, so these must be equal.
|
||||
assert_eq!(shard_index.shard_number, shard_id.number);
|
||||
assert_eq!(shard_index.shard_count, shard_id.count);
|
||||
return Self::get_page(ctx, timeline, req, io_concurrency, received_at).await;
|
||||
}
|
||||
|
||||
// The request spans multiple shards; split it and dispatch parallel requests. All pages
|
||||
// were originally in the parent shard, and during a split all children are local, so we
|
||||
// expect to find local shards for all pages.
|
||||
let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size))?;
|
||||
|
||||
let mut shard_requests = FuturesUnordered::new();
|
||||
for (shard_index, shard_req) in splitter.drain_requests() {
|
||||
let timeline = handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await?;
|
||||
let future = Self::get_page(
|
||||
ctx,
|
||||
timeline,
|
||||
shard_req,
|
||||
io_concurrency.clone(),
|
||||
received_at,
|
||||
)
|
||||
.map(move |result| result.map(|resp| (shard_index, resp)));
|
||||
shard_requests.push(future);
|
||||
}
|
||||
|
||||
while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? {
|
||||
splitter.add_response(shard_index, shard_response)?;
|
||||
}
|
||||
|
||||
Ok(splitter.collect_response()?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3558,11 +3747,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
// to be the sweet spot where throughput is saturated.
|
||||
const CHUNK_SIZE: usize = 256 * 1024;
|
||||
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Validate the request and decorate the span.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(tonic::Status::failed_precondition("timeline is archived"));
|
||||
}
|
||||
@@ -3678,11 +3866,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<proto::GetDbSizeRequest>,
|
||||
) -> Result<tonic::Response<proto::GetDbSizeResponse>, tonic::Status> {
|
||||
let received_at = extract::<ReceivedAt>(&req).0;
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
|
||||
@@ -3711,14 +3898,29 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<tonic::Streaming<proto::GetPageRequest>>,
|
||||
) -> Result<tonic::Response<Self::GetPagesStream>, tonic::Status> {
|
||||
// Extract the timeline from the request and check that it exists.
|
||||
let ttid = *extract::<TenantTimelineId>(&req);
|
||||
//
|
||||
// NB: during shard splits, the compute may still send requests to the parent shard. We'll
|
||||
// reroute requests to the child shards below, but we also detect the common cases here
|
||||
// where either the shard exists or no shards exist at all. If we have a child shard, we
|
||||
// can't acquire a weak handle because we don't know which child shard to use yet.
|
||||
let TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = *extract::<TenantTimelineId>(&req);
|
||||
let shard_index = *extract::<ShardIndex>(&req);
|
||||
let shard_selector = ShardSelector::Known(shard_index);
|
||||
|
||||
let mut handles = TimelineHandles::new(self.tenant_manager.clone());
|
||||
handles
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.await?;
|
||||
let timeline = match handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Known(shard_index))
|
||||
.await
|
||||
{
|
||||
// The timeline shard exists. Keep a weak handle to reuse for each request.
|
||||
Ok(timeline) => Some(timeline.downgrade()),
|
||||
// The shard doesn't exist, but a child shard does. We'll reroute requests later.
|
||||
Err(_) if self.tenant_manager.has_child_shard(tenant_id, shard_index) => None,
|
||||
// Failed to fetch the timeline, and no child shard exists. Error out.
|
||||
Err(err) => return Err(err.into()),
|
||||
};
|
||||
|
||||
// Spawn an IoConcurrency sidecar, if enabled.
|
||||
let gate_guard = self
|
||||
@@ -3735,11 +3937,9 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
let mut reqs = req.into_inner();
|
||||
|
||||
let resps = async_stream::try_stream! {
|
||||
let timeline = handles
|
||||
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
|
||||
.await?
|
||||
.downgrade();
|
||||
loop {
|
||||
// Wait for the next client request.
|
||||
//
|
||||
// NB: Tonic considers the entire stream to be an in-flight request and will wait
|
||||
// for it to complete before shutting down. React to cancellation between requests.
|
||||
let req = tokio::select! {
|
||||
@@ -3752,16 +3952,44 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
Err(err) => Err(err),
|
||||
},
|
||||
}?;
|
||||
|
||||
let received_at = Instant::now();
|
||||
let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
|
||||
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
|
||||
|
||||
// Process the request, using a closure to capture errors.
|
||||
let process_request = async || {
|
||||
let req = page_api::GetPageRequest::try_from(req)?;
|
||||
|
||||
// Fast path: use the pre-acquired timeline handle.
|
||||
if let Some(Ok(timeline)) = timeline.as_ref().map(|t| t.upgrade()) {
|
||||
return Self::get_page(&ctx, timeline, req, io_concurrency.clone(), received_at)
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await
|
||||
}
|
||||
|
||||
// The timeline handle is stale. During shard splits, the compute may still be
|
||||
// sending requests to the parent shard. Try to re-route requests to the child
|
||||
// shards, and split any batch requests that straddle multiple child shards.
|
||||
Self::maybe_split_get_page(
|
||||
&ctx,
|
||||
&mut handles,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
shard_index,
|
||||
req,
|
||||
io_concurrency.clone(),
|
||||
received_at,
|
||||
)
|
||||
.instrument(span.clone()) // propagate request span
|
||||
.await;
|
||||
yield match result {
|
||||
Ok(resp) => resp,
|
||||
// Convert per-request errors to GetPageResponses as appropriate, or terminate
|
||||
// the stream with a tonic::Status. Log the error regardless, since
|
||||
// ObservabilityLayer can't automatically log stream errors.
|
||||
.await
|
||||
};
|
||||
|
||||
// Return the response. Convert per-request errors to GetPageResponses if
|
||||
// appropriate, or terminate the stream with a tonic::Status.
|
||||
yield match process_request().await {
|
||||
Ok(resp) => resp.into(),
|
||||
Err(status) => {
|
||||
// Log the error, since ObservabilityLayer won't see stream errors.
|
||||
// TODO: it would be nice if we could propagate the get_page() fields here.
|
||||
span.in_scope(|| {
|
||||
warn!("request failed with {:?}: {}", status.code(), status.message());
|
||||
@@ -3781,11 +4009,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<proto::GetRelSizeRequest>,
|
||||
) -> Result<tonic::Response<proto::GetRelSizeResponse>, tonic::Status> {
|
||||
let received_at = extract::<ReceivedAt>(&req).0;
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?;
|
||||
let allow_missing = req.allow_missing;
|
||||
|
||||
@@ -3818,11 +4045,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
req: tonic::Request<proto::GetSlruSegmentRequest>,
|
||||
) -> Result<tonic::Response<proto::GetSlruSegmentResponse>, tonic::Status> {
|
||||
let received_at = extract::<ReceivedAt>(&req).0;
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let timeline = self.get_request_timeline_shard_zero(&req).await?;
|
||||
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
|
||||
|
||||
// Validate the request, decorate the span, and convert it to a Pagestream request.
|
||||
Self::ensure_shard_zero(&timeline)?;
|
||||
let req: page_api::GetSlruSegmentRequest = req.into_inner().try_into()?;
|
||||
|
||||
span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
|
||||
@@ -3852,6 +4078,10 @@ impl proto::PageService for GrpcPageServiceHandler {
|
||||
&self,
|
||||
req: tonic::Request<proto::LeaseLsnRequest>,
|
||||
) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
|
||||
// TODO: this won't work during shard splits, as the request is directed at a specific shard
|
||||
// but the parent shard is removed before the split commits and the compute is notified
|
||||
// (which can take several minutes for large tenants). That's also the case for the libpq
|
||||
// implementation, so we keep the behavior for now.
|
||||
let timeline = self.get_request_timeline(&req).await?;
|
||||
let ctx = self.ctx.with_scope_timeline(&timeline);
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||
//! Clarify that)
|
||||
//!
|
||||
use std::collections::{HashMap, HashSet, hash_map};
|
||||
use std::collections::{BTreeSet, HashMap, HashSet, hash_map};
|
||||
use std::ops::{ControlFlow, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -227,6 +227,25 @@ impl Timeline {
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
pending_metadata_bytes: 0,
|
||||
is_importing_pgdata: false,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn begin_modification_for_import(&self, lsn: Lsn) -> DatadirModification
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
DatadirModification {
|
||||
tline: self,
|
||||
pending_lsns: Vec::new(),
|
||||
pending_metadata_pages: HashMap::new(),
|
||||
pending_data_batch: None,
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
pending_metadata_bytes: 0,
|
||||
is_importing_pgdata: true,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -596,6 +615,50 @@ impl Timeline {
|
||||
self.get_rel_exists_in_reldir(tag, version, None, ctx).await
|
||||
}
|
||||
|
||||
async fn get_rel_exists_in_reldir_v1(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
if let Some((cached_key, dir)) = deserialized_reldir_v1 {
|
||||
if cached_key == key {
|
||||
return Ok(dir.rels.contains(&(tag.relnode, tag.forknum)));
|
||||
} else if cfg!(test) || cfg!(feature = "testing") {
|
||||
panic!("cached reldir key mismatch: {cached_key} != {key}");
|
||||
} else {
|
||||
warn!("cached reldir key mismatch: {cached_key} != {key}");
|
||||
}
|
||||
// Fallback to reading the directory from the datadir.
|
||||
}
|
||||
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
|
||||
}
|
||||
|
||||
async fn get_rel_exists_in_reldir_v2(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
|
||||
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?).map_err(
|
||||
|_| {
|
||||
PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid reldir key: decode failed, {}",
|
||||
key
|
||||
))
|
||||
},
|
||||
)?;
|
||||
let exists_v2 = buf == RelDirExists::Exists;
|
||||
Ok(exists_v2)
|
||||
}
|
||||
|
||||
/// Does the relation exist? With a cached deserialized `RelDirectory`.
|
||||
///
|
||||
/// There are some cases where the caller loops across all relations. In that specific case,
|
||||
@@ -627,45 +690,134 @@ impl Timeline {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// Read path: first read the new reldir keyspace. Early return if the relation exists.
|
||||
// Otherwise, read the old reldir keyspace.
|
||||
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
|
||||
let (v2_status, migrated_lsn) = self.get_rel_size_v2_status();
|
||||
|
||||
if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
|
||||
self.get_rel_size_v2_status()
|
||||
{
|
||||
// fetch directory listing (new)
|
||||
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
|
||||
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
|
||||
.map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
let exists_v2 = buf == RelDirExists::Exists;
|
||||
// Fast path: if the relation exists in the new format, return true.
|
||||
// TODO: we should have a verification mode that checks both keyspaces
|
||||
// to ensure the relation only exists in one of them.
|
||||
if exists_v2 {
|
||||
return Ok(true);
|
||||
match v2_status {
|
||||
RelSizeMigration::Legacy => {
|
||||
let v1_exists = self
|
||||
.get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx)
|
||||
.await?;
|
||||
Ok(v1_exists)
|
||||
}
|
||||
RelSizeMigration::Migrating | RelSizeMigration::Migrated
|
||||
if version.get_lsn() < migrated_lsn.unwrap_or(Lsn(0)) =>
|
||||
{
|
||||
// For requests below the migrated LSN, we still use the v1 read path.
|
||||
let v1_exists = self
|
||||
.get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx)
|
||||
.await?;
|
||||
Ok(v1_exists)
|
||||
}
|
||||
RelSizeMigration::Migrating => {
|
||||
let v1_exists = self
|
||||
.get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx)
|
||||
.await?;
|
||||
let v2_exists_res = self.get_rel_exists_in_reldir_v2(tag, version, ctx).await;
|
||||
match v2_exists_res {
|
||||
Ok(v2_exists) if v1_exists == v2_exists => {}
|
||||
Ok(v2_exists) => {
|
||||
tracing::warn!(
|
||||
"inconsistent v1/v2 reldir keyspace for rel {}: v1_exists={}, v2_exists={}",
|
||||
tag,
|
||||
v1_exists,
|
||||
v2_exists
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to get rel exists in v2: {e}");
|
||||
}
|
||||
}
|
||||
Ok(v1_exists)
|
||||
}
|
||||
RelSizeMigration::Migrated => {
|
||||
let v2_exists = self.get_rel_exists_in_reldir_v2(tag, version, ctx).await?;
|
||||
Ok(v2_exists)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fetch directory listing (old)
|
||||
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
|
||||
if let Some((cached_key, dir)) = deserialized_reldir_v1 {
|
||||
if cached_key == key {
|
||||
return Ok(dir.rels.contains(&(tag.relnode, tag.forknum)));
|
||||
} else if cfg!(test) || cfg!(feature = "testing") {
|
||||
panic!("cached reldir key mismatch: {cached_key} != {key}");
|
||||
} else {
|
||||
warn!("cached reldir key mismatch: {cached_key} != {key}");
|
||||
}
|
||||
// Fallback to reading the directory from the datadir.
|
||||
}
|
||||
async fn list_rels_v1(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum));
|
||||
Ok(exists_v1)
|
||||
let rels_v1: HashSet<RelTag> =
|
||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: *relnode,
|
||||
forknum: *forknum,
|
||||
}));
|
||||
Ok(rels_v1)
|
||||
}
|
||||
|
||||
async fn list_rels_v2(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
);
|
||||
let results = self
|
||||
.scan(
|
||||
KeySpace::single(key_range),
|
||||
version.get_lsn(),
|
||||
ctx,
|
||||
io_concurrency,
|
||||
)
|
||||
.await?;
|
||||
let mut rels = HashSet::new();
|
||||
for (key, val) in results {
|
||||
let val = RelDirExists::decode(&val?).map_err(|_| {
|
||||
PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid reldir key: decode failed, {}",
|
||||
key
|
||||
))
|
||||
})?;
|
||||
if key.field6 != 1 {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid reldir key: field6 != 1, {}",
|
||||
key
|
||||
)));
|
||||
}
|
||||
if key.field2 != spcnode {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid reldir key: field2 != spcnode, {}",
|
||||
key
|
||||
)));
|
||||
}
|
||||
if key.field3 != dbnode {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid reldir key: field3 != dbnode, {}",
|
||||
key
|
||||
)));
|
||||
}
|
||||
let tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
};
|
||||
if val == RelDirExists::Removed {
|
||||
debug_assert!(!rels.contains(&tag), "removed reltag in v2");
|
||||
continue;
|
||||
}
|
||||
let did_not_contain = rels.insert(tag);
|
||||
debug_assert!(did_not_contain, "duplicate reltag in v2");
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
@@ -683,60 +835,45 @@ impl Timeline {
|
||||
version: Version<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing (old)
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let (v2_status, migrated_lsn) = self.get_rel_size_v2_status();
|
||||
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
let rels_v1: HashSet<RelTag> =
|
||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: *relnode,
|
||||
forknum: *forknum,
|
||||
}));
|
||||
|
||||
if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
|
||||
return Ok(rels_v1);
|
||||
}
|
||||
|
||||
// scan directory listing (new), merge with the old results
|
||||
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
);
|
||||
let results = self
|
||||
.scan(
|
||||
KeySpace::single(key_range),
|
||||
version.get_lsn(),
|
||||
ctx,
|
||||
io_concurrency,
|
||||
)
|
||||
.await?;
|
||||
let mut rels = rels_v1;
|
||||
for (key, val) in results {
|
||||
let val = RelDirExists::decode(&val?)
|
||||
.map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
assert_eq!(key.field6, 1);
|
||||
assert_eq!(key.field2, spcnode);
|
||||
assert_eq!(key.field3, dbnode);
|
||||
let tag = RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: key.field4,
|
||||
forknum: key.field5,
|
||||
};
|
||||
if val == RelDirExists::Removed {
|
||||
debug_assert!(!rels.contains(&tag), "removed reltag in v2");
|
||||
continue;
|
||||
match v2_status {
|
||||
RelSizeMigration::Legacy => {
|
||||
let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?;
|
||||
Ok(rels_v1)
|
||||
}
|
||||
RelSizeMigration::Migrating | RelSizeMigration::Migrated
|
||||
if version.get_lsn() < migrated_lsn.unwrap_or(Lsn(0)) =>
|
||||
{
|
||||
// For requests below the migrated LSN, we still use the v1 read path.
|
||||
let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?;
|
||||
Ok(rels_v1)
|
||||
}
|
||||
RelSizeMigration::Migrating => {
|
||||
let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?;
|
||||
let rels_v2_res = self.list_rels_v2(spcnode, dbnode, version, ctx).await;
|
||||
match rels_v2_res {
|
||||
Ok(rels_v2) if rels_v1 == rels_v2 => {}
|
||||
Ok(rels_v2) => {
|
||||
tracing::warn!(
|
||||
"inconsistent v1/v2 reldir keyspace for db {} {}: v1_rels.len()={}, v2_rels.len()={}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
rels_v1.len(),
|
||||
rels_v2.len()
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to list rels in v2: {e}");
|
||||
}
|
||||
}
|
||||
Ok(rels_v1)
|
||||
}
|
||||
RelSizeMigration::Migrated => {
|
||||
let rels_v2 = self.list_rels_v2(spcnode, dbnode, version, ctx).await?;
|
||||
Ok(rels_v2)
|
||||
}
|
||||
let did_not_contain = rels.insert(tag);
|
||||
debug_assert!(did_not_contain, "duplicate reltag in v2");
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Get the whole SLRU segment
|
||||
@@ -1258,10 +1395,10 @@ impl Timeline {
|
||||
let mut dbdir_cnt = 0;
|
||||
let mut rel_cnt = 0;
|
||||
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for &(spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
dbdir_cnt += 1;
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
|
||||
.list_rels(spcnode, dbnode, Version::at(lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
rel_cnt += 1;
|
||||
@@ -1566,6 +1703,9 @@ pub struct DatadirModification<'a> {
|
||||
|
||||
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
|
||||
pending_metadata_bytes: usize,
|
||||
|
||||
/// Whether we are importing a pgdata directory.
|
||||
is_importing_pgdata: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -1578,6 +1718,14 @@ pub enum MetricsUpdate {
|
||||
Sub(u64),
|
||||
}
|
||||
|
||||
/// Controls the behavior of the reldir keyspace.
///
/// Produced by `maybe_enable_rel_size_v2` on the write path. Callers branch on
/// `current_status` to decide which of the v1/v2 reldir write paths to run, and on
/// `initialize` to decide whether the v2 keyspace must first be populated from v1.
|
||||
pub struct RelDirMode {
|
||||
// Whether we can read the v2 keyspace or not.
// This is the timeline's `RelSizeMigration` status as seen by the current modification;
// `put_rel_creation` bumps it to `Migrating` after a successful v2 initialization.
|
||||
current_status: RelSizeMigration,
|
||||
// Whether we should initialize the v2 keyspace or not.
// Only set when the config enables rel_size_v2, the status is still `Legacy`, the
// operation is a relation creation, and the modification is not a pgdata import.
|
||||
initialize: bool,
|
||||
}
|
||||
|
||||
impl DatadirModification<'_> {
|
||||
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
|
||||
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
|
||||
@@ -1933,30 +2081,49 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
|
||||
/// we enable it, we also need to persist it in `index_part.json`.
|
||||
pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
|
||||
let status = self.tline.get_rel_size_v2_status();
|
||||
/// we enable it, we also need to persist it in `index_part.json` (initialize is true).
|
||||
///
|
||||
/// As this function is only used on the write path, we do not need to read the migrated_at
|
||||
/// field.
|
||||
pub fn maybe_enable_rel_size_v2(&mut self, is_create: bool) -> anyhow::Result<RelDirMode> {
|
||||
// TODO: define the behavior of the tenant-level config flag and use feature flag to enable this feature
|
||||
|
||||
let (status, _) = self.tline.get_rel_size_v2_status();
|
||||
let config = self.tline.get_rel_size_v2_enabled();
|
||||
match (config, status) {
|
||||
(false, RelSizeMigration::Legacy) => {
|
||||
// tenant config didn't enable it and we didn't write any reldir_v2 key yet
|
||||
Ok(false)
|
||||
Ok(RelDirMode {
|
||||
current_status: RelSizeMigration::Legacy,
|
||||
initialize: false,
|
||||
})
|
||||
}
|
||||
(false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
|
||||
(false, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => {
|
||||
// index_part already persisted that the timeline has enabled rel_size_v2
|
||||
Ok(true)
|
||||
Ok(RelDirMode {
|
||||
current_status: status,
|
||||
initialize: false,
|
||||
})
|
||||
}
|
||||
(true, RelSizeMigration::Legacy) => {
|
||||
// The first time we enable it, we need to persist it in `index_part.json`
|
||||
self.tline
|
||||
.update_rel_size_v2_status(RelSizeMigration::Migrating)?;
|
||||
tracing::info!("enabled rel_size_v2");
|
||||
Ok(true)
|
||||
// The caller should update the reldir status once the initialization is done.
|
||||
//
|
||||
// Only initialize the v2 keyspace on new relation creation. No initialization
|
||||
// during `timeline_create` (TODO: fix this, we should allow, but currently it
|
||||
// hits consistency issues).
|
||||
Ok(RelDirMode {
|
||||
current_status: RelSizeMigration::Legacy,
|
||||
initialize: is_create && !self.is_importing_pgdata,
|
||||
})
|
||||
}
|
||||
(true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
|
||||
(true, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => {
|
||||
// index_part already persisted that the timeline has enabled rel_size_v2
|
||||
// and we don't need to do anything
|
||||
Ok(true)
|
||||
Ok(RelDirMode {
|
||||
current_status: status,
|
||||
initialize: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1969,8 +2136,8 @@ impl DatadirModification<'_> {
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
let v2_mode = self
|
||||
.maybe_enable_rel_size_v2(false)
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
@@ -1986,17 +2153,19 @@ impl DatadirModification<'_> {
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
}
|
||||
if r.is_none() {
|
||||
// Create RelDirectory
|
||||
// TODO: if we have fully migrated to v2, no need to create this directory
|
||||
if v2_mode.current_status != RelSizeMigration::Legacy {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
|
||||
}
|
||||
|
||||
// Create RelDirectory in v1 keyspace. TODO: if we have fully migrated to v2, no need to create this directory.
|
||||
// Some code paths rely on this directory being present. We should remove it once we start setting tenants to
|
||||
// `RelSizeMigration::Migrated` state (currently we don't, all tenants will have `RelSizeMigration::Migrating`).
|
||||
let buf = RelDirectory::ser(&RelDirectory {
|
||||
rels: HashSet::new(),
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
|
||||
if v2_enabled {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
|
||||
}
|
||||
self.put(
|
||||
rel_dir_to_key(spcnode, dbnode),
|
||||
Value::Image(Bytes::from(buf)),
|
||||
@@ -2103,6 +2272,109 @@ impl DatadirModification<'_> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn initialize_rel_size_v2_keyspace(
|
||||
&mut self,
|
||||
ctx: &RequestContext,
|
||||
dbdir: &DbDirectory,
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Copy everything from relv1 to relv2; TODO: check if there's any key in the v2 keyspace, if so, abort.
|
||||
tracing::info!("initializing rel_size_v2 keyspace");
|
||||
let mut rel_cnt = 0;
|
||||
// relmap_exists (the value of dbdirs hashmap) does not affect the migration: we need to copy things over anyways
|
||||
for &(spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
let rel_dir_key = rel_dir_to_key(spcnode, dbnode);
|
||||
let rel_dir = RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?;
|
||||
for (relnode, forknum) in rel_dir.rels {
|
||||
let sparse_rel_dir_key = rel_tag_sparse_key(spcnode, dbnode, relnode, forknum);
|
||||
self.put(
|
||||
sparse_rel_dir_key,
|
||||
Value::Image(RelDirExists::Exists.encode()),
|
||||
);
|
||||
tracing::info!(
|
||||
"migrated rel_size_v2: {}",
|
||||
RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum
|
||||
}
|
||||
);
|
||||
rel_cnt += 1;
|
||||
}
|
||||
}
|
||||
tracing::info!(
|
||||
"initialized rel_size_v2 keyspace at lsn {}: migrated {} relations",
|
||||
self.lsn,
|
||||
rel_cnt
|
||||
);
|
||||
self.tline
|
||||
.update_rel_size_v2_status(RelSizeMigration::Migrating, Some(self.lsn))
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
Ok::<_, WalIngestError>(())
|
||||
}
|
||||
|
||||
async fn put_rel_creation_v1(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
dbdir_exists: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Reldir v1 write path
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if !dbdir_exists {
|
||||
// Create the RelDirectory
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn put_rel_creation_v2(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
dbdir_exists: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Reldir v2 write path
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
|
||||
let val = RelDirExists::decode_option(val)
|
||||
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
|
||||
if val == RelDirExists::Exists {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
self.put(
|
||||
sparse_rel_dir_key,
|
||||
Value::Image(RelDirExists::Exists.encode()),
|
||||
);
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a relation fork.
|
||||
///
|
||||
/// 'nblocks' is the initial size.
|
||||
@@ -2136,66 +2408,31 @@ impl DatadirModification<'_> {
|
||||
true
|
||||
};
|
||||
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if !dbdir_exists {
|
||||
// Create the RelDirectory
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
};
|
||||
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
let mut v2_mode = self
|
||||
.maybe_enable_rel_size_v2(true)
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
|
||||
if v2_enabled {
|
||||
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
if v2_mode.initialize {
|
||||
if let Err(e) = self.initialize_rel_size_v2_keyspace(ctx, &dbdir).await {
|
||||
tracing::warn!("error initializing rel_size_v2 keyspace: {}", e);
|
||||
// TODO: circuit breaker so that it won't retry forever
|
||||
} else {
|
||||
v2_mode.current_status = RelSizeMigration::Migrating;
|
||||
}
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
|
||||
let val = RelDirExists::decode_option(val)
|
||||
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
|
||||
if val == RelDirExists::Exists {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
|
||||
if v2_mode.current_status != RelSizeMigration::Migrated {
|
||||
self.put_rel_creation_v1(rel, dbdir_exists, ctx).await?;
|
||||
}
|
||||
|
||||
if v2_mode.current_status != RelSizeMigration::Legacy {
|
||||
let write_v2_res = self.put_rel_creation_v2(rel, dbdir_exists, ctx).await;
|
||||
if let Err(e) = write_v2_res {
|
||||
if v2_mode.current_status == RelSizeMigration::Migrated {
|
||||
return Err(e);
|
||||
}
|
||||
tracing::warn!("error writing rel_size_v2 keyspace: {}", e);
|
||||
}
|
||||
self.put(
|
||||
sparse_rel_dir_key,
|
||||
Value::Image(RelDirExists::Exists.encode()),
|
||||
);
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
|
||||
// We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
|
||||
// TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
|
||||
// will be key not found errors if we don't create an empty one for rel_size_v2.
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
|
||||
);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
|
||||
} else {
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
|
||||
}
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
|
||||
);
|
||||
}
|
||||
|
||||
// Put size
|
||||
@@ -2270,15 +2507,12 @@ impl DatadirModification<'_> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Drop some relations
|
||||
pub(crate) async fn put_rel_drops(
|
||||
async fn put_rel_drop_v1(
|
||||
&mut self,
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
) -> Result<BTreeSet<RelTag>, WalIngestError> {
|
||||
let mut dropped_rels = BTreeSet::new();
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2290,25 +2524,8 @@ impl DatadirModification<'_> {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
|
||||
dirty = true;
|
||||
dropped_rels.insert(rel_tag);
|
||||
true
|
||||
} else if v2_enabled {
|
||||
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
|
||||
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
|
||||
// logic).
|
||||
let key =
|
||||
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
|
||||
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
if val == RelDirExists::Exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
|
||||
// put tombstone
|
||||
self.put(key, Value::Image(RelDirExists::Removed.encode()));
|
||||
// no need to set dirty to true
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
};
|
||||
@@ -2331,7 +2548,67 @@ impl DatadirModification<'_> {
|
||||
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
||||
}
|
||||
}
|
||||
Ok(dropped_rels)
|
||||
}
|
||||
|
||||
async fn put_rel_drop_v2(
|
||||
&mut self,
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeSet<RelTag>, WalIngestError> {
|
||||
let mut dropped_rels = BTreeSet::new();
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
for rel_tag in rel_tags {
|
||||
let key = rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
|
||||
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
if val == RelDirExists::Exists {
|
||||
dropped_rels.insert(rel_tag);
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
|
||||
// put tombstone
|
||||
self.put(key, Value::Image(RelDirExists::Removed.encode()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(dropped_rels)
|
||||
}
|
||||
|
||||
/// Drop some relations
|
||||
pub(crate) async fn put_rel_drops(
|
||||
&mut self,
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_mode = self
|
||||
.maybe_enable_rel_size_v2(false)
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
match v2_mode.current_status {
|
||||
RelSizeMigration::Legacy => {
|
||||
self.put_rel_drop_v1(drop_relations, ctx).await?;
|
||||
}
|
||||
RelSizeMigration::Migrating => {
|
||||
let dropped_rels_v1 = self.put_rel_drop_v1(drop_relations.clone(), ctx).await?;
|
||||
let dropped_rels_v2_res = self.put_rel_drop_v2(drop_relations, ctx).await;
|
||||
match dropped_rels_v2_res {
|
||||
Ok(dropped_rels_v2) => {
|
||||
if dropped_rels_v1 != dropped_rels_v2 {
|
||||
tracing::warn!(
|
||||
"inconsistent v1/v2 rel drop: dropped_rels_v1.len()={}, dropped_rels_v2.len()={}",
|
||||
dropped_rels_v1.len(),
|
||||
dropped_rels_v2.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("error dropping rels: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
RelSizeMigration::Migrated => {
|
||||
self.put_rel_drop_v2(drop_relations, ctx).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -1205,6 +1205,7 @@ impl TenantShard {
|
||||
idempotency.clone(),
|
||||
index_part.gc_compaction.clone(),
|
||||
index_part.rel_size_migration.clone(),
|
||||
index_part.rel_size_migrated_at,
|
||||
ctx,
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
@@ -2584,6 +2585,7 @@ impl TenantShard {
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -2913,6 +2915,7 @@ impl TenantShard {
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -4342,6 +4345,7 @@ impl TenantShard {
|
||||
create_idempotency: CreateTimelineIdempotency,
|
||||
gc_compaction_state: Option<GcCompactionState>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
rel_size_migrated_at: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(Arc<Timeline>, RequestContext)> {
|
||||
let state = match cause {
|
||||
@@ -4376,6 +4380,7 @@ impl TenantShard {
|
||||
create_idempotency,
|
||||
gc_compaction_state,
|
||||
rel_size_v2_status,
|
||||
rel_size_migrated_at,
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
|
||||
@@ -5085,6 +5090,7 @@ impl TenantShard {
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
|
||||
let (rel_size_v2_status, rel_size_migrated_at) = src_timeline.get_rel_size_v2_status();
|
||||
let (uninitialized_timeline, _timeline_ctx) = self
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
@@ -5092,7 +5098,8 @@ impl TenantShard {
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
Some(src_timeline.get_rel_size_v2_status()),
|
||||
Some(rel_size_v2_status),
|
||||
rel_size_migrated_at,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5379,6 +5386,7 @@ impl TenantShard {
|
||||
pgdata_lsn,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
@@ -5462,14 +5470,17 @@ impl TenantShard {
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
rel_size_migrated_at: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let resources = self.build_timeline_resources(new_timeline_id);
|
||||
resources
|
||||
.remote_client
|
||||
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
|
||||
resources.remote_client.init_upload_queue_for_empty_remote(
|
||||
new_metadata,
|
||||
rel_size_v2_status.clone(),
|
||||
rel_size_migrated_at,
|
||||
)?;
|
||||
|
||||
let (timeline_struct, timeline_ctx) = self
|
||||
.create_timeline_struct(
|
||||
@@ -5482,6 +5493,7 @@ impl TenantShard {
|
||||
create_guard.idempotency.clone(),
|
||||
None,
|
||||
rel_size_v2_status,
|
||||
rel_size_migrated_at,
|
||||
ctx,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
|
||||
@@ -826,6 +826,18 @@ impl TenantManager {
|
||||
peek_slot.is_some()
|
||||
}
|
||||
|
||||
/// Returns whether a local shard exists that's a child of the given tenant shard. Note that
|
||||
/// this just checks for any shard with a larger shard count, and it may not be a direct child
|
||||
/// of the given shard (their keyspace may not overlap).
|
||||
pub(crate) fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool {
|
||||
match &*self.tenants.read().unwrap() {
|
||||
TenantsMap::Initializing => false,
|
||||
TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.any(|(tsid, _)| tsid.shard_count > shard_index.shard_count),
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn upsert_location(
|
||||
&self,
|
||||
@@ -1522,6 +1534,13 @@ impl TenantManager {
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
//
|
||||
// TODO: keeping the parent as InProgress while spawning the children causes read
|
||||
// unavailability, as we can't acquire a new timeline handle for it (existing handles appear
|
||||
// to still work though, even downgraded ones). The parent should be available for reads
|
||||
// until the children are ready -- potentially until *all* subsplits across all parent
|
||||
// shards are complete and the compute has been notified. See:
|
||||
// <https://databricks.atlassian.net/browse/LKB-672>.
|
||||
drop(tenant);
|
||||
let mut parent_slot_guard =
|
||||
self.tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
|
||||
@@ -443,7 +443,8 @@ impl RemoteTimelineClient {
|
||||
pub fn init_upload_queue_for_empty_remote(
|
||||
&self,
|
||||
local_metadata: &TimelineMetadata,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
rel_size_v2_migration: Option<RelSizeMigration>,
|
||||
rel_size_migrated_at: Option<Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
@@ -455,7 +456,8 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
let initialized_queue =
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
|
||||
initialized_queue.dirty.rel_size_migration = rel_size_v2_migration;
|
||||
initialized_queue.dirty.rel_size_migrated_at = rel_size_migrated_at;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
@@ -994,10 +996,12 @@ impl RemoteTimelineClient {
|
||||
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
|
||||
self: &Arc<Self>,
|
||||
rel_size_v2_status: RelSizeMigration,
|
||||
rel_size_migrated_at: Option<Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
|
||||
upload_queue.dirty.rel_size_migrated_at = rel_size_migrated_at;
|
||||
// TODO: allow this operation to bypass the validation check because we might upload the index part
|
||||
// with no layers but the flag updated. For now, we just modify the index part in memory and the next
|
||||
// upload will include the flag.
|
||||
|
||||
@@ -114,6 +114,11 @@ pub struct IndexPart {
|
||||
/// The timestamp when the timeline was marked invisible in synthetic size calculations.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) marked_invisible_at: Option<NaiveDateTime>,
|
||||
|
||||
/// The LSN at which we started the rel size migration. Accesses below this LSN should be
|
||||
/// processed with the v1 read path. Usually this LSN should be set together with `rel_size_migration`.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) rel_size_migrated_at: Option<Lsn>,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
@@ -142,10 +147,12 @@ impl IndexPart {
|
||||
/// - 12: +l2_lsn
|
||||
/// - 13: +gc_compaction
|
||||
/// - 14: +marked_invisible_at
|
||||
const LATEST_VERSION: usize = 14;
|
||||
/// - 15: +rel_size_migrated_at
|
||||
const LATEST_VERSION: usize = 15;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] =
|
||||
&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -165,6 +172,7 @@ impl IndexPart {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -475,6 +483,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -524,6 +533,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -574,6 +584,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -627,6 +638,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||
@@ -675,6 +687,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -726,6 +739,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -782,6 +796,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -843,6 +858,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -905,6 +921,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -972,6 +989,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1052,6 +1070,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1133,6 +1152,7 @@ mod tests {
|
||||
l2_lsn: None,
|
||||
gc_compaction: None,
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1220,6 +1240,7 @@ mod tests {
|
||||
last_completed_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
}),
|
||||
marked_invisible_at: None,
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
@@ -1308,6 +1329,97 @@ mod tests {
|
||||
last_completed_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
}),
|
||||
marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
rel_size_migrated_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v15_rel_size_migrated_at_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 15,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
},
|
||||
"import_pgdata": {
|
||||
"V1": {
|
||||
"Done": {
|
||||
"idempotency_key": "specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5",
|
||||
"started_at": "2024-11-13T09:23:42.123",
|
||||
"finished_at": "2024-11-13T09:42:23.123"
|
||||
}
|
||||
}
|
||||
},
|
||||
"rel_size_migration": "legacy",
|
||||
"l2_lsn": "0/16960E8",
|
||||
"gc_compaction": {
|
||||
"last_completed_lsn": "0/16960E8"
|
||||
},
|
||||
"marked_invisible_at": "2023-07-31T09:00:00.123",
|
||||
"rel_size_migrated_at": "0/16960E8"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 15,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
PgMajorVersion::PG14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
import_pgdata: Some(import_pgdata::index_part_format::Root::V1(import_pgdata::index_part_format::V1::Done(import_pgdata::index_part_format::Done{
|
||||
started_at: parse_naive_datetime("2024-11-13T09:23:42.123000000"),
|
||||
finished_at: parse_naive_datetime("2024-11-13T09:42:23.123000000"),
|
||||
idempotency_key: import_pgdata::index_part_format::IdempotencyKey::new("specified-by-client-218a5213-5044-4562-a28d-d024c5f057f5".to_string()),
|
||||
}))),
|
||||
rel_size_migration: Some(RelSizeMigration::Legacy),
|
||||
l2_lsn: Some("0/16960E8".parse::<Lsn>().unwrap()),
|
||||
gc_compaction: Some(GcCompactionState {
|
||||
last_completed_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
}),
|
||||
marked_invisible_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
rel_size_migrated_at: Some("0/16960E8".parse::<Lsn>().unwrap()),
|
||||
};
|
||||
|
||||
let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap();
|
||||
|
||||
@@ -70,7 +70,7 @@ use tracing::*;
|
||||
use utils::generation::Generation;
|
||||
use utils::guard_arc_swap::GuardArcSwap;
|
||||
use utils::id::TimelineId;
|
||||
use utils::logging::{MonitorSlowFutureCallback, monitor_slow_future};
|
||||
use utils::logging::{MonitorSlowFutureCallback, log_slow, monitor_slow_future};
|
||||
use utils::lsn::{AtomicLsn, Lsn, RecordLsn};
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::rate_limit::RateLimit;
|
||||
@@ -397,6 +397,11 @@ pub struct Timeline {
|
||||
/// If true, the last compaction failed.
|
||||
compaction_failed: AtomicBool,
|
||||
|
||||
/// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline.
|
||||
/// We need to feed this information back to the Safekeeper and postgres for them to take the
|
||||
/// appropriate action.
|
||||
corruption_detected: AtomicBool,
|
||||
|
||||
/// Notifies the tenant compaction loop that there is pending L0 compaction work.
|
||||
l0_compaction_trigger: Arc<Notify>,
|
||||
|
||||
@@ -441,7 +446,7 @@ pub struct Timeline {
|
||||
/// heatmap on demand.
|
||||
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
|
||||
|
||||
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
|
||||
pub(crate) rel_size_v2_status: ArcSwap<(Option<RelSizeMigration>, Option<Lsn>)>,
|
||||
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore,
|
||||
|
||||
@@ -2894,12 +2899,9 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
|
||||
}
|
||||
|
||||
pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
|
||||
self.rel_size_v2_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.map(|s| s.as_ref().clone())
|
||||
.unwrap_or(RelSizeMigration::Legacy)
|
||||
pub(crate) fn get_rel_size_v2_status(&self) -> (RelSizeMigration, Option<Lsn>) {
|
||||
let (status, migrated_at) = self.rel_size_v2_status.load().as_ref().clone();
|
||||
(status.unwrap_or(RelSizeMigration::Legacy), migrated_at)
|
||||
}
|
||||
|
||||
fn get_compaction_upper_limit(&self) -> usize {
|
||||
@@ -3174,6 +3176,7 @@ impl Timeline {
|
||||
create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
gc_compaction_state: Option<GcCompactionState>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
rel_size_migrated_at: Option<Lsn>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -3312,6 +3315,7 @@ impl Timeline {
|
||||
|
||||
compaction_lock: tokio::sync::Mutex::default(),
|
||||
compaction_failed: AtomicBool::default(),
|
||||
corruption_detected: AtomicBool::default(),
|
||||
l0_compaction_trigger: resources.l0_compaction_trigger,
|
||||
gc_lock: tokio::sync::Mutex::default(),
|
||||
|
||||
@@ -3338,7 +3342,10 @@ impl Timeline {
|
||||
|
||||
heatmap_layers_downloader: Mutex::new(None),
|
||||
|
||||
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
|
||||
rel_size_v2_status: ArcSwap::from_pointee((
|
||||
rel_size_v2_status,
|
||||
rel_size_migrated_at,
|
||||
)),
|
||||
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
|
||||
|
||||
@@ -3426,11 +3433,17 @@ impl Timeline {
|
||||
pub(crate) fn update_rel_size_v2_status(
|
||||
&self,
|
||||
rel_size_v2_status: RelSizeMigration,
|
||||
rel_size_migrated_at: Option<Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
self.rel_size_v2_status
|
||||
.store(Some(Arc::new(rel_size_v2_status.clone())));
|
||||
self.rel_size_v2_status.store(Arc::new((
|
||||
Some(rel_size_v2_status.clone()),
|
||||
rel_size_migrated_at,
|
||||
)));
|
||||
self.remote_client
|
||||
.schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
|
||||
.schedule_index_upload_for_rel_size_v2_status_update(
|
||||
rel_size_v2_status,
|
||||
rel_size_migrated_at,
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
|
||||
@@ -5997,6 +6010,17 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
// Begin Hadron
|
||||
//
|
||||
fail_point!("create-image-layer-fail-simulated-corruption", |_| {
|
||||
self.corruption_detected
|
||||
.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
Err(CreateImageLayersError::Other(anyhow::anyhow!(
|
||||
"failpoint create-image-layer-fail-simulated-corruption"
|
||||
)))
|
||||
});
|
||||
// End Hadron
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.gate
|
||||
@@ -6891,7 +6915,13 @@ impl Timeline {
|
||||
|
||||
write_guard.store_and_unlock(new_gc_cutoff)
|
||||
};
|
||||
waitlist.wait().await;
|
||||
let waitlist_wait_fut = std::pin::pin!(waitlist.wait());
|
||||
log_slow(
|
||||
"applied_gc_cutoff waitlist wait",
|
||||
Duration::from_secs(30),
|
||||
waitlist_wait_fut,
|
||||
)
|
||||
.await;
|
||||
|
||||
info!("GC starting");
|
||||
|
||||
@@ -7136,6 +7166,7 @@ impl Timeline {
|
||||
critical_timeline!(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
Some(&self.corruption_detected),
|
||||
"walredo failure during page reconstruction: {err:?}"
|
||||
);
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user