Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-16 09:52:54 +00:00)

Merge pull request #4000 from neondatabase/releases/2023-04-11

Release 2023-04-11

.github/actions/allure-report/action.yml (48 changes, vendored)
@@ -15,10 +15,32 @@ outputs:
report-url:
description: 'Allure report URL'
value: ${{ steps.generate-report.outputs.report-url }}
report-json-url:
description: 'Allure report JSON URL'
value: ${{ steps.generate-report.outputs.report-json-url }}

runs:
using: "composite"

steps:
# We're using some of env variables quite offen, so let's set them once.
#
# It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
#
# - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
# - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
#
- name: Set common environment variables
shell: bash -euxo pipefail {0}
run: |
echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV
echo "BUCKET=${BUCKET}" >> $GITHUB_ENV
echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
env:
BUILD_TYPE: ${{ inputs.build_type }}
BUCKET: neon-github-public-dev
TEST_OUTPUT: /tmp/test_output

- name: Validate input parameters
shell: bash -euxo pipefail {0}
run: |
@@ -76,16 +98,14 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.19.0
ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
ALLURE_VERSION: 2.21.0
ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c

- name: Upload Allure results
if: ${{ inputs.action == 'store' }}
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
shell: bash -euxo pipefail {0}
run: |
@@ -104,7 +124,7 @@ runs:
EOF
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
TEST_SELECTION=${{ inputs.test_selection }}
BUILD_TYPE=${{ inputs.build_type }}
BUILD_TYPE=${BUILD_TYPE}
EOF

ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
@@ -113,13 +133,12 @@ runs:
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"

# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
- name: Acquire Allure lock
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
LOCK_TIMEOUT=300 # seconds
@@ -149,8 +168,6 @@ runs:
env:
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
run: |
# Get previously uploaded data for this run
@@ -186,24 +203,24 @@ runs:
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html

# Generate redirect
cat <<EOF > ./index.html
cat <<EOF > ${TEST_OUTPUT}/allure/index.html
<!DOCTYPE html>

<meta charset="utf-8">
<title>Redirecting to ${REPORT_URL}</title>
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
EOF
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT

- name: Release Allure lock
if: ${{ inputs.action == 'generate' && always() }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
@@ -212,11 +229,16 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi

- name: Cleanup
if: always()
shell: bash -euxo pipefail {0}
run: |
rm -rf ${TEST_OUTPUT}/allure

- uses: actions/github-script@v6
if: ${{ inputs.action == 'generate' && always() }}
env:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
BUILD_TYPE: ${{ inputs.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
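The "improvised lock" in the action above serializes report generation for the same key by treating an S3 object (lock.txt) plus a 300-second timeout as an advisory lock. The following is only a rough Rust sketch of that idea, not the action's actual implementation (which is plain bash and `aws s3`); the `ObjectStore` trait and the polling interval are assumptions made for the example.

```rust
use std::thread::sleep;
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Hypothetical object-store interface standing in for the `aws s3 cp/rm` calls.
trait ObjectStore {
    fn get(&self, key: &str) -> Option<String>;
    fn put(&self, key: &str, body: &str);
    fn delete(&self, key: &str);
}

const LOCK_TIMEOUT_SECS: u64 = 300; // same budget as LOCK_TIMEOUT in the action

fn now_secs() -> u64 {
    SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs()
}

/// Wait until the lock object is absent or stale, then claim it by writing our timestamp.
/// Like the bash version, this is advisory and not perfectly atomic: two runs can still
/// occasionally race for the same key, which the workflow tolerates.
fn acquire_lock(store: &dyn ObjectStore, lock_file: &str) {
    loop {
        match store.get(lock_file).and_then(|body| body.parse::<u64>().ok()) {
            Some(ts) if now_secs().saturating_sub(ts) < LOCK_TIMEOUT_SECS => {
                sleep(Duration::from_secs(5)) // somebody else holds a fresh lock
            }
            _ => break, // no lock object, unparsable contents, or the holder timed out
        }
    }
    store.put(lock_file, &now_secs().to_string());
}

/// Drop the lock; in the action this is `aws s3 rm`, guarded by a check that we still own it.
fn release_lock(store: &dyn ObjectStore, lock_file: &str) {
    store.delete(lock_file);
}
```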
.github/actions/neon-project-create/action.yml (16 changes, vendored)

@@ -14,6 +14,12 @@ inputs:
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
provisioner:
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'

outputs:
dsn:
@@ -31,6 +37,10 @@ runs:
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
fi

project=$(curl \
"https://${API_HOST}/api/v2/projects" \
--fail \
@@ -42,6 +52,9 @@ runs:
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"provisioner\": \"${PROVISIONER}\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": { }
}
}")
@@ -62,3 +75,6 @@ runs:
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
PROVISIONER: ${{ inputs.provisioner }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
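The new compute_units input is a "[Min, Max]" pair: k8s-neonvm may autoscale within the range, while k8s-pod computes are fixed-size, so Min must equal Max. A minimal Rust sketch of that constraint, assuming my own function name and error wording; the defaults and example values come from the action above.

```rust
/// Hypothetical mirror of the action's compute_units check: `[min, max]` CUs,
/// where k8s-pod requires a fixed size and k8s-neonvm may autoscale in the range.
fn validate_compute_units(provisioner: &str, min_cu: f64, max_cu: f64) -> Result<(), String> {
    if min_cu <= 0.0 || max_cu < min_cu {
        return Err(format!("invalid compute unit range [{min_cu}, {max_cu}]"));
    }
    if provisioner == "k8s-pod" && min_cu != max_cu {
        // Matches the shell check: "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
        return Err("k8s-pod does not autoscale; MIN_CU must equal MAX_CU".to_string());
    }
    Ok(())
}

fn main() {
    // The action's defaults: provisioner k8s-pod, compute_units [1, 1].
    assert!(validate_compute_units("k8s-pod", 1.0, 1.0).is_ok());
    // A freetier-style compute on k8s-neonvm could autoscale, e.g. [0.25, 1].
    assert!(validate_compute_units("k8s-neonvm", 0.25, 1.0).is_ok());
    assert!(validate_compute_units("k8s-pod", 0.25, 1.0).is_err());
}
```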
.github/actions/run-python-test-set/action.yml (12 changes, vendored)

@@ -44,6 +44,10 @@ inputs:
description: 'Secret access key'
required: false
default: ''
rerun_flaky:
description: 'Whether to rerun flaky tests'
required: false
default: 'false'

runs:
using: "composite"
@@ -101,6 +105,7 @@ runs:
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -143,6 +148,13 @@ runs:
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi

if [ "${RERUN_FLAKY}" == "true" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"

EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi

if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
.github/ansible/scripts/init_pageserver.sh (6 changes, vendored)

@@ -3,6 +3,8 @@
# fetch params from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type)
DISK_SIZE=$(df -B1 /storage | tail -1 | awk '{print $2}')

# store fqdn hostname in var
HOST=$(hostname -f)
@@ -18,7 +20,9 @@ cat <<EOF | tee /tmp/payload
"http_host": "${HOST}",
"http_port": 9898,
"active": false,
"availability_zone_id": "${AZ_ID}"
"availability_zone_id": "${AZ_ID}",
"disk_size": ${DISK_SIZE},
"instance_type": "${INSTANCE_TYPE}"
}
EOF
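The script change above adds the instance type (from the EC2 metadata service) and the /storage disk size (from `df -B1`) to the pageserver registration payload. A hedged Rust sketch of the same payload shape using serde_json; the function name and the concrete host, AZ, instance-type, and size values are placeholders, not values from the repository.

```rust
use serde_json::json;

/// Values the script pulls from the EC2 metadata service and `df`; here they are
/// plain parameters, since this only illustrates the shape of the payload.
fn registration_payload(
    host: &str,
    az_id: &str,
    instance_type: &str,
    disk_size_bytes: u64,
) -> serde_json::Value {
    json!({
        "http_host": host,
        "http_port": 9898,
        "active": false,
        "availability_zone_id": az_id,
        "disk_size": disk_size_bytes,
        "instance_type": instance_type
    })
}

fn main() {
    let payload =
        registration_payload("ps-1.example.internal", "use2-az1", "i3en.3xlarge", 7_500_000_000_000);
    println!("{}", serde_json::to_string_pretty(&payload).unwrap());
}
```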
@@ -23,6 +23,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
domain: "*.us-east-2.aws.neon.build"
extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"]
sentryEnvironment: "staging"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"

@@ -24,6 +24,7 @@ settings:
authBackend: "console"
authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
domain: "*.us-west-2.aws.neon.tech"
extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"]
sentryEnvironment: "production"
wssPort: 8443
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
.github/workflows/benchmarking.yml (161 changes, vendored)

@@ -107,25 +107,65 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

generate-matrices:
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
runs-on: ubuntu-latest
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}

steps:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-new",
"neon-captest-reuse"
],
"db_size": [ "10gb" ],
"include": [
{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" }
]
}'

if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi

echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT

- name: Generate matrix for OLAP benchmarks
id: olap-compare-matrix
run: |
matrix='{
"platform": [
"neon-captest-reuse"
]
}'

if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
fi

echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
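The generate-matrices job builds the strategy matrices as JSON strings and only appends the RDS platforms on Saturdays. The sketch below reproduces the pgbench matrix logic in Rust with serde_json and chrono instead of `jq` and `date +%A`; the function name is mine, everything else mirrors the step above.

```rust
use chrono::{Datelike, Utc, Weekday};
use serde_json::{json, Value};

/// Build the pgbench matrix the way the workflow's jq pipeline does:
/// start from the always-on platforms and append the RDS entries once a week.
fn pgbench_matrix(today: Weekday) -> Value {
    let mut matrix = json!({
        "platform": ["neon-captest-new", "neon-captest-reuse"],
        "db_size": ["10gb"],
        "include": [
            { "platform": "neon-captest-freetier", "db_size": "3gb" },
            { "platform": "neon-captest-new", "db_size": "50gb" }
        ]
    });

    if today == Weekday::Sat {
        let extra = [
            json!({ "platform": "rds-postgres", "db_size": "10gb" }),
            json!({ "platform": "rds-aurora", "db_size": "50gb" }),
        ];
        matrix["include"].as_array_mut().unwrap().extend(extra);
    }
    matrix
}

fn main() {
    // The workflow checks `date +%A`; here we ask chrono for the current weekday.
    let matrix = pgbench_matrix(Utc::now().weekday());
    println!("{}", serde_json::to_string(&matrix).unwrap()); // compact, like `jq --compact-output`
}
```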
pgbench-compare:
needs: [ generate-matrices ]

strategy:
fail-fast: false
matrix:
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
db_size: [ 10gb ]
runner: [ us-east-2 ]
include:
- platform: neon-captest-prefetch
db_size: 50gb
runner: us-east-2
- platform: rds-aurora
db_size: 50gb
runner: us-east-2
matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}}

env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
@@ -137,7 +177,7 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}

runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ]
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -160,13 +200,14 @@ jobs:
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}

- name: Set up Connection String
id: set-up-connstr
@@ -175,7 +216,7 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neon-captest-new | neon-captest-prefetch)
neon-captest-new | neon-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -185,7 +226,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -194,17 +235,6 @@ jobs:

psql ${CONNSTR} -c "SELECT version();"

- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")

psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

- name: Benchmark init
uses: ./.github/actions/run-python-test-set
with:
@@ -276,15 +306,11 @@ jobs:
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: success() || failure()
needs: [ pgbench-compare ]
needs: [ generate-matrices, pgbench-compare ]

strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}

env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -320,7 +346,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
@@ -330,7 +356,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -339,17 +365,6 @@ jobs:

psql ${CONNSTR} -c "SELECT version();"

- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")

psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
with:
@@ -387,15 +402,11 @@ jobs:
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: success() || failure()
needs: [ clickbench-compare ]
needs: [ generate-matrices, clickbench-compare ]

strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}

env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -431,7 +442,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
;;
rds-aurora)
@@ -441,7 +452,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -450,17 +461,6 @@ jobs:

psql ${CONNSTR} -c "SELECT version();"

- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")

psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
with:
@@ -492,15 +492,11 @@ jobs:

user-examples-compare:
if: success() || failure()
needs: [ tpch-compare ]
needs: [ generate-matrices, tpch-compare ]

strategy:
fail-fast: false
matrix:
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}

env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -536,7 +532,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest-prefetch)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
@@ -546,7 +542,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -555,17 +551,6 @@ jobs:

psql ${CONNSTR} -c "SELECT version();"

- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")

psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

- name: Run user examples
uses: ./.github/actions/run-python-test-set
with:
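Each of the *-compare jobs above maps the matrix platform to a platform-specific connection-string secret and fails the step for anything it does not recognize (and this release replaces neon-captest-prefetch with neon-captest-reuse in those lists). A small Rust sketch of that validation, using the same allowed-platform names as the OLAP jobs; the function name is an assumption for the example.

```rust
/// The allowed platform names for the OLAP comparison jobs above; anything else
/// should fail immediately, mirroring the `*)` branch of the case statement.
const ALLOWED_PLATFORMS: &[&str] = &["neon-captest-reuse", "rds-aurora", "rds-postgres"];

fn check_platform(platform: &str) -> Result<(), String> {
    if ALLOWED_PLATFORMS.contains(&platform) {
        Ok(())
    } else {
        Err(format!(
            "Unknown PLATFORM={platform}. Allowed only: {}",
            ALLOWED_PLATFORMS.join(", ")
        ))
    }
}

fn main() {
    assert!(check_platform("neon-captest-reuse").is_ok());
    assert!(check_platform("neon-captest-prefetch").is_err()); // retired in this release
}
```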
.github/workflows/build_and_test.yml (88 changes, vendored)

@@ -335,6 +335,10 @@ jobs:
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
rerun_flaky: true
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
@@ -371,42 +375,90 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones

merge-allure-report:
create-test-report:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: ${{ !cancelled() }}
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false

- name: Create Allure report
id: create-allure-report
steps:
- uses: actions/checkout@v3

- name: Create Allure report (debug)
if: ${{ !cancelled() }}
id: create-allure-report-debug
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ matrix.build_type }}
build_type: debug

- name: Create Allure report (release)
if: ${{ !cancelled() }}
id: create-allure-report-release
uses: ./.github/actions/allure-report
with:
action: generate
build_type: release

- uses: actions/github-script@v6
if: >
!cancelled() &&
github.event_name == 'pull_request' && (
steps.create-allure-report-debug.outputs.report-url ||
steps.create-allure-report-release.outputs.report-url
)
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const reports = [{
buildType: "debug",
reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
}, {
buildType: "release",
reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}",
}]

const script = require("./scripts/pr-comment-test-report.js")
await script({
github,
context,
fetch,
reports,
})

- name: Store Allure test stat in the DB
if: ${{ steps.create-allure-report.outputs.report-url }}
if: >
!cancelled() && (
steps.create-allure-report-debug.outputs.report-url ||
steps.create-allure-report-release.outputs.report-url
)
env:
BUILD_TYPE: ${{ matrix.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
run: |
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
./scripts/pysync

DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
if [ -z "$report_url" ]; then
continue
fi

if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
BUILD_TYPE=debug
else
BUILD_TYPE=release
fi

curl --fail --output suites.json "${report_url}"
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
done
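The reworked "Store Allure test stat in the DB" step now iterates over the debug and release suites.json URLs, skips whichever report was not generated, derives the build type from the URL, and ingests each one. A hedged Rust sketch of that control flow; the `ingest` stub stands in for the curl download plus scripts/ingest_regress_test_result.py, and is not a real API of the repository.

```rust
/// Stand-in for the per-report work: in the workflow this is a curl download of
/// suites.json followed by scripts/ingest_regress_test_result.py.
fn ingest(build_type: &str, suites_json_url: &str) {
    println!("ingesting {suites_json_url} as build_type={build_type}");
}

fn main() {
    // Mirrors REPORT_JSON_URL_DEBUG / REPORT_JSON_URL_RELEASE: either may be empty
    // if the corresponding Allure report was not generated.
    let report_json_urls = [
        ("debug", std::env::var("REPORT_JSON_URL_DEBUG").unwrap_or_default()),
        ("release", std::env::var("REPORT_JSON_URL_RELEASE").unwrap_or_default()),
    ];

    for (build_type, url) in report_json_urls {
        if url.is_empty() {
            continue; // same as the `[ -z "$report_url" ] && continue` guard
        }
        ingest(build_type, &url);
    }
}
```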
coverage-report:
runs-on: [ self-hosted, gen3, small ]
14
Cargo.lock
generated
14
Cargo.lock
generated
@@ -841,6 +841,18 @@ dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
@@ -848,6 +860,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"clap 4.1.4",
|
||||
"compute_api",
|
||||
"futures",
|
||||
"hyper",
|
||||
"notify",
|
||||
@@ -3354,6 +3367,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
|
||||
@@ -101,6 +101,7 @@ test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
tokio-rustls = "0.23"
|
||||
tokio-stream = "0.1"
|
||||
@@ -132,6 +133,7 @@ tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df6
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
|
||||
@@ -38,6 +38,7 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
@@ -300,6 +301,27 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "timescaledb-pg-build"
|
||||
# compile timescaledb extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS timescaledb-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y cmake && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
|
||||
cd build && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
@@ -404,6 +426,7 @@ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
|
||||
@@ -27,4 +27,5 @@ tracing-subscriber.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -34,22 +34,24 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
use url::Url;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
@@ -62,7 +64,7 @@ fn main() -> Result<()> {
|
||||
let connstr = matches
|
||||
.get_one::<String>("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.get_one::<String>("spec");
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
@@ -71,40 +73,106 @@ fn main() -> Result<()> {
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
let spec: ComputeSpec = match spec {
|
||||
let mut spec = Default::default();
|
||||
let mut spec_set = false;
|
||||
let mut live_config_allowed = false;
|
||||
match spec_json {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
Some(json) => {
|
||||
spec = serde_json::from_str(json)?;
|
||||
spec_set = true;
|
||||
}
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
spec = serde_json::from_reader(file)?;
|
||||
spec_set = true;
|
||||
} else if let Some(id) = compute_id {
|
||||
if let Some(cp_base) = control_plane_uri {
|
||||
let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
|
||||
reqwest::blocking::Client::new()
|
||||
.get(cp_uri)
|
||||
.header("Authorization", jwt)
|
||||
.send()?
|
||||
.json()?
|
||||
live_config_allowed = true;
|
||||
if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
|
||||
spec = s;
|
||||
spec_set = true;
|
||||
}
|
||||
} else {
|
||||
panic!(
|
||||
"must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
|
||||
control_plane_uri, compute_id
|
||||
);
|
||||
panic!("must specify both --control-plane-uri and --compute-id or none");
|
||||
}
|
||||
} else {
|
||||
panic!("compute spec should be provided via --spec or --spec-path argument");
|
||||
panic!(
|
||||
"compute spec should be provided by one of the following ways: \
|
||||
--spec OR --spec-path OR --control-plane-uri and --compute-id"
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
if spec_set {
|
||||
new_state.spec = spec;
|
||||
}
|
||||
let compute_node = ComputeNode {
|
||||
start_time: Utc::now(),
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
// Launch http service first, so we were able to serve control-plane
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
|
||||
if state.status == ComputeStatus::ConfigurationPending {
|
||||
info!("got spec, continue configuration");
|
||||
// Spec is already set by the http server handler.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We got all we need, fill in the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
let pageserver_connstr = state
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.pageserver_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let storage_auth_token = state.spec.storage_auth_token.clone();
|
||||
let tenant = state
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.tenant_id")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = state
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.timeline_id")
|
||||
.expect("tenant id should be provided");
|
||||
let startup_tracing_context = state.spec.startup_tracing_context.clone();
|
||||
|
||||
state.pageserver_connstr = pageserver_connstr;
|
||||
state.storage_auth_token = storage_auth_token;
|
||||
state.tenant = tenant;
|
||||
state.timeline = timeline;
|
||||
state.status = ComputeStatus::Init;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the spec, and
|
||||
// attach it to the current tracing context.
|
||||
//
|
||||
@@ -120,7 +188,7 @@ fn main() -> Result<()> {
|
||||
// postgres is configured and up-and-running, we exit this span. Any other
|
||||
// actions that are performed on incoming HTTP requests, for example, are
|
||||
// performed in separate spans.
|
||||
let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
|
||||
let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
||||
Some(TraceContextPropagator::new().extract(carrier).attach())
|
||||
@@ -128,41 +196,7 @@ fn main() -> Result<()> {
|
||||
None
|
||||
};
|
||||
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.pageserver_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.tenant_id")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("neon.timeline_id")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
let compute_state = ComputeNode {
|
||||
start_time: Utc::now(),
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
tenant,
|
||||
timeline,
|
||||
pageserver_connstr,
|
||||
storage_auth_token,
|
||||
metrics: ComputeMetrics::default(),
|
||||
state: RwLock::new(ComputeState::new()),
|
||||
};
|
||||
let compute = Arc::new(compute_state);
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
|
||||
// Start Postgres
|
||||
@@ -172,7 +206,7 @@ fn main() -> Result<()> {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:?}", err);
|
||||
let mut state = compute.state.write().unwrap();
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
drop(state);
|
||||
@@ -262,7 +296,7 @@ fn cli() -> clap::Command {
|
||||
Arg::new("control-plane-uri")
|
||||
.short('p')
|
||||
.long("control-plane-uri")
|
||||
.value_name("CONTROL_PLANE"),
|
||||
.value_name("CONTROL_PLANE_API_BASE_URI"),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -19,16 +19,17 @@ use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{Condvar, Mutex};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Serialize, Serializer};
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
use crate::checker::create_writability_check_data;
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
@@ -41,41 +42,54 @@ pub struct ComputeNode {
|
||||
pub connstr: url::Url,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
/// We should only allow live re- / configuration of the compute node if
|
||||
/// it uses 'pull model', i.e. it can go to control-plane and fetch
|
||||
/// the latest configuration. Otherwise, there could be a case:
|
||||
/// - we start compute with some spec provided as argument
|
||||
/// - we push new spec and it does reconfiguration
|
||||
/// - but then something happens and compute pod / VM is destroyed,
|
||||
/// so k8s controller starts it again with the **old** spec
|
||||
/// and the same for empty computes:
|
||||
/// - we started compute without any spec
|
||||
/// - we push spec and it does configuration
|
||||
/// - but then it is restarted without any spec again
|
||||
pub live_config_allowed: bool,
|
||||
/// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
|
||||
/// To allow HTTP API server to serving status requests, while configuration
|
||||
/// is in progress, lock should be held only for short periods of time to do
|
||||
/// read/write, not the whole configuration process.
|
||||
pub state: Mutex<ComputeState>,
|
||||
/// `Condvar` to allow notifying waiters about state changes.
|
||||
pub state_changed: Condvar,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
/// Timestamp of the last Postgres activity
|
||||
pub last_active: DateTime<Utc>,
|
||||
pub error: Option<String>,
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant: String,
|
||||
pub timeline: String,
|
||||
pub pageserver_connstr: String,
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
pub metrics: ComputeMetrics,
|
||||
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
|
||||
/// to allow HTTP API server to serve status requests, while configuration
|
||||
/// is in progress.
|
||||
pub state: RwLock<ComputeState>,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
x.to_rfc3339().serialize(s)
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
/// Timestamp of the last Postgres activity
|
||||
#[serde(serialize_with = "rfc3339_serialize")]
|
||||
pub last_active: DateTime<Utc>,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
impl ComputeState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
status: ComputeStatus::Init,
|
||||
status: ComputeStatus::Empty,
|
||||
last_active: Utc::now(),
|
||||
error: None,
|
||||
spec: ComputeSpec::default(),
|
||||
tenant: String::new(),
|
||||
timeline: String::new(),
|
||||
pageserver_connstr: String::new(),
|
||||
storage_auth_token: None,
|
||||
metrics: ComputeMetrics::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -86,29 +100,15 @@ impl Default for ComputeState {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
Init,
|
||||
Running,
|
||||
Failed,
|
||||
}
|
||||
|
||||
#[derive(Default, Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
pub sync_safekeepers_ms: AtomicU64,
|
||||
pub basebackup_ms: AtomicU64,
|
||||
pub config_ms: AtomicU64,
|
||||
pub total_startup_ms: AtomicU64,
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
self.state.write().unwrap().status = status;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.status = status;
|
||||
self.state_changed.notify_all();
|
||||
}
|
||||
|
||||
pub fn get_status(&self) -> ComputeStatus {
|
||||
self.state.read().unwrap().status
|
||||
self.state.lock().unwrap().status
|
||||
}
|
||||
|
||||
// Remove `pgdata` directory and create it again with right permissions.
|
||||
@@ -124,15 +124,15 @@ impl ComputeNode {
|
||||
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
#[instrument(skip(self))]
|
||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||
#[instrument(skip(self, compute_state))]
|
||||
fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
|
||||
let mut config = postgres::Config::from_str(&compute_state.pageserver_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
if let Some(storage_auth_token) = &compute_state.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token);
|
||||
} else {
|
||||
@@ -141,8 +141,14 @@ impl ComputeNode {
|
||||
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||
"0/0" => format!(
|
||||
"basebackup {} {}",
|
||||
&compute_state.tenant, &compute_state.timeline
|
||||
), // First start of the compute
|
||||
_ => format!(
|
||||
"basebackup {} {} {}",
|
||||
&compute_state.tenant, &compute_state.timeline, lsn
|
||||
),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
|
||||
@@ -155,28 +161,24 @@ impl ComputeNode {
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
self.metrics.basebackup_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
#[instrument(skip(self))]
|
||||
fn sync_safekeepers(&self) -> Result<String> {
|
||||
#[instrument(skip(self, storage_auth_token))]
|
||||
fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<String> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
.args(["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
.envs(if let Some(storage_auth_token) = &storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
@@ -201,14 +203,11 @@ impl ComputeNode {
|
||||
);
|
||||
}
|
||||
|
||||
self.metrics.sync_safekeepers_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
@@ -217,9 +216,9 @@ impl ComputeNode {
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
#[instrument(skip(self))]
|
||||
pub fn prepare_pgdata(&self) -> Result<()> {
|
||||
let spec = &self.spec;
|
||||
#[instrument(skip(self, compute_state))]
|
||||
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
let spec = &compute_state.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
@@ -228,18 +227,18 @@ impl ComputeNode {
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers()
|
||||
.sync_safekeepers(compute_state.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
lsn, &compute_state.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(&lsn).with_context(|| {
|
||||
self.get_basebackup(compute_state, &lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
lsn, &compute_state.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
|
||||
@@ -252,13 +251,16 @@ impl ComputeNode {
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
#[instrument(skip(self))]
|
||||
pub fn start_postgres(&self) -> Result<std::process::Child> {
|
||||
pub fn start_postgres(
|
||||
&self,
|
||||
storage_auth_token: Option<String>,
|
||||
) -> Result<std::process::Child> {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(["-D", &self.pgdata])
|
||||
.envs(if let Some(storage_auth_token) = &self.storage_auth_token {
|
||||
.envs(if let Some(storage_auth_token) = &storage_auth_token {
|
||||
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
|
||||
} else {
|
||||
vec![]
|
||||
@@ -271,8 +273,9 @@ impl ComputeNode {
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub fn apply_config(&self) -> Result<()> {
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
#[instrument(skip(self, compute_state))]
|
||||
pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
// If connection fails,
|
||||
// it may be the old node with `zenith_admin` superuser.
|
||||
//
|
||||
@@ -303,19 +306,19 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
handle_roles(&self.spec, &mut client)?;
|
||||
handle_databases(&self.spec, &mut client)?;
|
||||
handle_role_deletions(self, &mut client)?;
|
||||
handle_grants(self, &mut client)?;
|
||||
handle_roles(&compute_state.spec, &mut client)?;
|
||||
handle_databases(&compute_state.spec, &mut client)?;
|
||||
handle_role_deletions(&compute_state.spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(&compute_state.spec, self.connstr.as_str(), &mut client)?;
|
||||
create_writability_check_data(&mut client)?;
|
||||
handle_extensions(&self.spec, &mut client)?;
|
||||
handle_extensions(&compute_state.spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
self.spec.cluster.cluster_id
|
||||
compute_state.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -323,40 +326,37 @@ impl ComputeNode {
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
self.spec.cluster.cluster_id,
|
||||
self.spec.operation_uuid.as_ref().unwrap(),
|
||||
self.tenant,
|
||||
self.timeline,
|
||||
compute_state.spec.cluster.cluster_id,
|
||||
compute_state.spec.operation_uuid.as_ref().unwrap(),
|
||||
compute_state.tenant,
|
||||
compute_state.timeline,
|
||||
);
|
||||
|
||||
self.prepare_pgdata()?;
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pg = self.start_postgres()?;
|
||||
let pg = self.start_postgres(compute_state.storage_auth_token.clone())?;
|
||||
|
||||
self.apply_config()?;
|
||||
self.apply_config(&compute_state)?;
|
||||
|
||||
let startup_end_time = Utc::now();
|
||||
self.metrics.config_ms.store(
|
||||
startup_end_time
|
||||
{
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.config_ms = startup_end_time
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
self.metrics.total_startup_ms.store(
|
||||
startup_end_time
|
||||
.as_millis() as u64;
|
||||
state.metrics.total_startup_ms = startup_end_time
|
||||
.signed_duration_since(self.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
.as_millis() as u64;
|
||||
}
|
||||
self.set_status(ComputeStatus::Running);
|
||||
|
||||
Ok(pg)
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::pg_helpers::PgOptionsSerialize;
|
||||
use crate::spec::ComputeSpec;
|
||||
use compute_api::spec::ComputeSpec;
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
|
||||
@@ -3,15 +3,29 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::compute::{ComputeNode, ComputeState};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use num_cpus;
|
||||
use serde_json;
|
||||
use tokio::task;
|
||||
use tracing::{error, info};
|
||||
use tracing_utils::http::OtelName;
|
||||
|
||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
ComputeStatusResponse {
|
||||
tenant: state.tenant.clone(),
|
||||
timeline: state.timeline.clone(),
|
||||
status: state.status,
|
||||
last_active: state.last_active,
|
||||
error: state.error.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
||||
//
|
||||
@@ -23,26 +37,45 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
info!("serving /status GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
|
||||
let state = compute.state.lock().unwrap();
|
||||
let status_response = status_response_from_state(&state);
|
||||
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
|
||||
}
|
||||
|
||||
// Startup metrics in JSON format. Keep /metrics reserved for a possible
|
||||
// future use for Prometheus metrics format.
|
||||
(&Method::GET, "/metrics.json") => {
|
||||
info!("serving /metrics.json GET request");
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
let metrics = compute.state.lock().unwrap().metrics.clone();
|
||||
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
|
||||
}
|
||||
|
||||
// Collect Postgres current usage insights
|
||||
(&Method::GET, "/insights") => {
|
||||
info!("serving /insights GET request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!("compute is not running, current status: {:?}", status);
|
||||
error!(msg);
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let insights = compute.collect_insights().await;
|
||||
Response::new(Body::from(insights))
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for check_writability request: {:?}",
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let res = crate::checker::check_writability(compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
@@ -61,6 +94,23 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
))
|
||||
}
|
||||
|
||||
// Accept spec in JSON format and request compute configuration. If
// anything goes wrong after we set the compute status to `ConfigurationPending`
// and update the compute state with the new spec, we essentially leave the compute
// in a potentially wrong state. That said, it's the control-plane's
// responsibility to watch the compute state after a reconfiguration request
// and to cleanly restart the compute in case of errors.
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
|
||||
match handle_configure_request(req, compute).await {
|
||||
Ok(msg) => Response::new(Body::from(msg)),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /configure request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
@@ -70,6 +120,88 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_configure_request(
|
||||
req: Request<Body>,
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<String, (String, StatusCode)> {
|
||||
if !compute.live_config_allowed {
|
||||
return Err((
|
||||
"live configuration is not allowed for this compute node".to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
));
|
||||
}
|
||||
|
||||
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
|
||||
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
|
||||
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
|
||||
let spec = request.spec;
|
||||
// XXX: wrap the state update under lock in a code block. Otherwise,
// we will try to `Send` `mut state` into the spawned thread
// below, which will cause an error:
// ```
// error: future cannot be sent between threads safely
// ```
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status != ComputeStatus::Empty {
|
||||
let msg = format!(
|
||||
"invalid compute status for configuration request: {:?}",
|
||||
state.status.clone()
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.spec = spec;
|
||||
state.status = ComputeStatus::ConfigurationPending;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
info!("set new spec and notified waiters");
|
||||
}
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Running.
// This is needed so that we do not block the main pool of workers and
// can serve other requests while a particular request
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Running {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become Running, current status: {:?}",
|
||||
state.status
|
||||
);
|
||||
|
||||
if state.status == ComputeStatus::Failed {
|
||||
let err = state.error.as_ref().map_or("unknown error", |x| x);
|
||||
let msg = format!("compute configuration failed: {:?}", err);
|
||||
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
|
||||
// Return current compute state if everything went well.
|
||||
let state = compute.state.lock().unwrap().clone();
|
||||
let status_response = status_response_from_state(&state);
|
||||
Ok(serde_json::to_string(&status_response).unwrap())
|
||||
} else {
|
||||
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
|
||||
}
|
||||
}
|
||||
|
||||
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
let error = GenericAPIError {
|
||||
error: e.to_string(),
|
||||
};
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||
.unwrap()
|
||||
}
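// Illustrative sketch, not part of the diff: how a caller could drive the blocking
// POST /configure endpoint handled above. The listen address is an assumption used
// only for illustration; the body shape mirrors `ConfigurationRequest` ({"spec": ...}).
fn request_configuration(spec: serde_json::Value) -> anyhow::Result<serde_json::Value> {
    let resp = reqwest::blocking::Client::new()
        .post("http://127.0.0.1:3080/configure") // assumed compute_ctl HTTP address
        .json(&serde_json::json!({ "spec": spec }))
        .send()?;
    // A 2xx response carries the serialized compute status; 4xx/5xx responses
    // carry a `GenericAPIError` JSON body instead.
    Ok(resp.json()?)
}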
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<ComputeNode>) {
|
||||
|
||||
@@ -11,7 +11,7 @@ paths:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get compute node internal status
|
||||
summary: Get compute node internal status.
|
||||
description: ""
|
||||
operationId: getComputeStatus
|
||||
responses:
|
||||
@@ -26,7 +26,7 @@ paths:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get compute node startup metrics in JSON format
|
||||
summary: Get compute node startup metrics in JSON format.
|
||||
description: ""
|
||||
operationId: getComputeMetricsJSON
|
||||
responses:
|
||||
@@ -41,9 +41,9 @@ paths:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get current compute insights in JSON format
|
||||
summary: Get current compute insights in JSON format.
|
||||
description: |
|
||||
Note, that this doesn't include any historical data
|
||||
Note, that this doesn't include any historical data.
|
||||
operationId: getComputeInsights
|
||||
responses:
|
||||
200:
|
||||
@@ -56,12 +56,12 @@ paths:
|
||||
/info:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get info about the compute Pod/VM
|
||||
- Info
|
||||
summary: Get info about the compute pod / VM.
|
||||
description: ""
|
||||
operationId: getInfo
|
||||
responses:
|
||||
"200":
|
||||
200:
|
||||
description: Info
|
||||
content:
|
||||
application/json:
|
||||
@@ -72,7 +72,7 @@ paths:
|
||||
post:
|
||||
tags:
|
||||
- Check
|
||||
summary: Check that we can write new data on this compute
|
||||
summary: Check that we can write new data on this compute.
|
||||
description: ""
|
||||
operationId: checkComputeWritability
|
||||
responses:
|
||||
@@ -82,9 +82,64 @@ paths:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
description: Error text or 'true' if check passed.
|
||||
example: "true"
|
||||
|
||||
/configure:
|
||||
post:
|
||||
tags:
|
||||
- Configure
|
||||
summary: Perform compute node configuration.
|
||||
description: |
This is a blocking API endpoint, i.e. it blocks waiting until
compute has finished configuration and is in the `Running` state.
An optional non-blocking mode could be added later.
|
||||
operationId: configureCompute
|
||||
requestBody:
|
||||
description: Configuration request.
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
required:
|
||||
- spec
|
||||
properties:
|
||||
spec:
|
||||
# XXX: I don't want to explain current spec in the OpenAPI format,
|
||||
# as it could be changed really soon. Consider doing it later.
|
||||
type: object
|
||||
responses:
|
||||
200:
|
||||
description: Compute configuration finished.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeState"
|
||||
400:
|
||||
description: Provided spec is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
412:
|
||||
description: |
|
||||
It's not possible to do live-configuration of the compute.
|
||||
It's either in the wrong state, or compute doesn't use pull
|
||||
mode of configuration.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: |
|
||||
Compute configuration request was processed, but an error
occurred. Compute will likely shut down soon.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -95,7 +150,7 @@ components:
|
||||
schemas:
|
||||
ComputeMetrics:
|
||||
type: object
|
||||
description: Compute startup metrics
|
||||
description: Compute startup metrics.
|
||||
required:
|
||||
- sync_safekeepers_ms
|
||||
- basebackup_ms
|
||||
@@ -113,7 +168,7 @@ components:
|
||||
|
||||
Info:
|
||||
type: object
|
||||
description: Information about VM/Pod
|
||||
description: Information about VM/Pod.
|
||||
required:
|
||||
- num_cpus
|
||||
properties:
|
||||
@@ -130,17 +185,26 @@ components:
|
||||
$ref: '#/components/schemas/ComputeStatus'
|
||||
last_active:
|
||||
type: string
|
||||
description: The last detected compute activity timestamp in UTC and RFC3339 format
|
||||
description: The last detected compute activity timestamp in UTC and RFC3339 format.
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
error:
|
||||
type: string
|
||||
description: Text of the error during compute startup, if any
|
||||
description: Text of the error during compute startup, if any.
|
||||
example: ""
|
||||
tenant:
|
||||
type: string
|
||||
description: Identifier of the current tenant served by compute node, if any.
|
||||
example: c9269c359e9a199fad1ea0981246a78f
|
||||
timeline:
|
||||
type: string
|
||||
description: Identifier of the current timeline served by compute node, if any.
|
||||
example: ece7de74d4b8cbe5433a68ce4d1b97b4
|
||||
|
||||
ComputeInsights:
|
||||
type: object
|
||||
properties:
|
||||
pg_stat_statements:
|
||||
description: Contains raw output from pg_stat_statements in JSON format
|
||||
description: Contains raw output from pg_stat_statements in JSON format.
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
@@ -151,6 +215,19 @@ components:
|
||||
- init
|
||||
- failed
|
||||
- running
|
||||
example: running
|
||||
|
||||
#
|
||||
# Errors
|
||||
#
|
||||
|
||||
GenericError:
|
||||
type: object
|
||||
required:
|
||||
- error
|
||||
properties:
|
||||
error:
|
||||
type: string
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
|
||||
@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
|
||||
&[],
|
||||
);
|
||||
let mut last_active = compute.state.read().unwrap().last_active;
|
||||
let mut last_active = compute.state.lock().unwrap().last_active;
|
||||
|
||||
if let Ok(backs) = backends {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
@@ -87,7 +87,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = compute.state.write().unwrap();
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if last_active > state.last_active {
|
||||
state.last_active = last_active;
|
||||
debug!("set the last compute activity time to: {}", last_active);
|
||||
|
||||
@@ -10,43 +10,12 @@ use std::time::{Duration, Instant};
|
||||
use anyhow::{bail, Result};
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::{Client, Transaction};
|
||||
use serde::Deserialize;
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Rust representation of Postgres role info with only those fields
|
||||
/// that matter for us.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Role {
|
||||
pub name: PgIdent,
|
||||
pub encrypted_password: Option<String>,
|
||||
pub options: GenericOptions,
|
||||
}
|
||||
|
||||
/// Rust representation of Postgres database info with only those fields
|
||||
/// that matter for us.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Database {
|
||||
pub name: PgIdent,
|
||||
pub owner: PgIdent,
|
||||
pub options: GenericOptions,
|
||||
}
|
||||
|
||||
/// Common type representing both SQL statement params with or without value,
|
||||
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
|
||||
/// options like `wal_level = logical`.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct GenericOption {
|
||||
pub name: String,
|
||||
pub value: Option<String>,
|
||||
pub vartype: String,
|
||||
}
|
||||
|
||||
/// Optional collection of `GenericOption`'s. Type alias allows us to
|
||||
/// declare a `trait` on it.
|
||||
pub type GenericOptions = Option<Vec<GenericOption>>;
|
||||
|
||||
/// Escape a string for including it in a SQL literal
|
||||
fn escape_literal(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
@@ -58,9 +27,14 @@ fn escape_conf_value(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
}
|
||||
|
||||
impl GenericOption {
|
||||
trait GenericOptionExt {
|
||||
fn to_pg_option(&self) -> String;
|
||||
fn to_pg_setting(&self) -> String;
|
||||
}
|
||||
|
||||
impl GenericOptionExt for GenericOption {
|
||||
/// Represent `GenericOption` as SQL statement parameter.
|
||||
pub fn to_pg_option(&self) -> String {
|
||||
fn to_pg_option(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} '{}'", self.name, escape_literal(val)),
|
||||
@@ -72,7 +46,7 @@ impl GenericOption {
|
||||
}
|
||||
|
||||
/// Represent `GenericOption` as configuration option.
|
||||
pub fn to_pg_setting(&self) -> String {
|
||||
fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
|
||||
@@ -131,10 +105,14 @@ impl GenericOptionsSearch for GenericOptions {
|
||||
}
|
||||
}
|
||||
|
||||
impl Role {
|
||||
pub trait RoleExt {
|
||||
fn to_pg_options(&self) -> String;
|
||||
}
|
||||
|
||||
impl RoleExt for Role {
|
||||
/// Serialize a list of role parameters into a Postgres-acceptable
|
||||
/// string of arguments.
|
||||
pub fn to_pg_options(&self) -> String {
|
||||
fn to_pg_options(&self) -> String {
|
||||
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
|
||||
// For now, we do not use generic `options` for roles. Once used, add
|
||||
// `self.options.as_pg_options()` somewhere here.
|
||||
@@ -159,21 +137,17 @@ impl Role {
|
||||
}
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn new(name: PgIdent, owner: PgIdent) -> Self {
|
||||
Self {
|
||||
name,
|
||||
owner,
|
||||
options: None,
|
||||
}
|
||||
}
|
||||
pub trait DatabaseExt {
|
||||
fn to_pg_options(&self) -> String;
|
||||
}
|
||||
|
||||
impl DatabaseExt for Database {
|
||||
/// Serialize a list of database parameters into a Postgres-acceptable
|
||||
/// string of arguments.
|
||||
/// NB: `TEMPLATE` is actually also an identifier, but so far we only need
|
||||
/// to use `template0` and `template1`, so it is not a problem. Yet in the future
|
||||
/// it may require a proper quoting too.
|
||||
pub fn to_pg_options(&self) -> String {
|
||||
fn to_pg_options(&self) -> String {
|
||||
let mut params: String = self.options.as_pg_options();
|
||||
write!(params, " OWNER {}", &self.owner.pg_quote())
|
||||
.expect("String is documented to not to error during write operations");
|
||||
@@ -182,10 +156,6 @@ impl Database {
|
||||
}
|
||||
}
|
||||
|
||||
/// String type alias representing Postgres identifier and
|
||||
/// intended to be used for DB / role names.
|
||||
pub type PgIdent = String;
|
||||
|
||||
/// Generic trait used to provide quoting / encoding for strings used in the
|
||||
/// Postgres SQL queries and DATABASE_URL.
|
||||
pub trait Escaping {
|
||||
@@ -226,7 +196,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| Database::new(row.get("datname"), row.get("owner")))
|
||||
.map(|row| Database {
|
||||
name: row.get("datname"),
|
||||
owner: row.get("owner"),
|
||||
options: None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(postgres_dbs)
|
||||
|
||||
@@ -1,57 +1,38 @@
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
use postgres::config::Config;
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::Deserialize;
|
||||
use tracing::{info, info_span, instrument, span_enabled, warn, Level};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::config;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
|
||||
|
||||
pub storage_auth_token: Option<String>,
|
||||
/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
|
||||
/// env variable is set, it will be used for authorization.
|
||||
pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
|
||||
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
info!("getting spec from control plane: {}", cp_uri);
|
||||
|
||||
pub startup_tracing_context: Option<HashMap<String, String>>,
|
||||
}
|
||||
// TODO: check the response. We should distinguish cases when it's
|
||||
// - network error, then retry
|
||||
// - no spec for compute yet, then wait
|
||||
// - compute id is unknown or any other error, then bail out
|
||||
let spec = reqwest::blocking::Client::new()
|
||||
.get(cp_uri)
|
||||
.header("Authorization", jwt)
|
||||
.send()?
|
||||
.json()?;
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
Ok(spec)
|
||||
}
|
||||
|
||||
/// It takes cluster specification and does the following:
|
||||
@@ -226,8 +207,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
/// Reassign all dependent objects and delete requested roles.
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
if let Some(ops) = &node.spec.delta_operations {
|
||||
pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
|
||||
if let Some(ops) = &spec.delta_operations {
|
||||
// First, reassign all dependent objects to db owners.
|
||||
info!("reassigning dependent objects of to-be-deleted roles");
|
||||
|
||||
@@ -244,7 +225,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
|
||||
// Check that role is still present in Postgres, as this could be a
|
||||
// restart with the same spec after role deletion.
|
||||
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
|
||||
reassign_owned_objects(node, &op.name)?;
|
||||
reassign_owned_objects(spec, connstr, &op.name)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,10 +249,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
|
||||
}
|
||||
|
||||
// Reassign all owned objects in all databases to the owner of the database.
|
||||
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &node.spec.cluster.databases {
|
||||
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut conf = Config::from_str(node.connstr.as_str())?;
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
@@ -416,9 +397,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
let spec = &node.spec;
|
||||
|
||||
pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
// We now have a separate `web_access` role to connect to the database
|
||||
@@ -450,8 +429,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||
// atomically.
|
||||
for db in &node.spec.cluster.databases {
|
||||
let mut conf = Config::from_str(node.connstr.as_str())?;
|
||||
for db in &spec.cluster.databases {
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
|
||||
let mut db_client = conf.connect(NoTls)?;
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
#[cfg(test)]
|
||||
mod pg_helpers_tests {
|
||||
|
||||
use std::fs::File;
|
||||
|
||||
use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent};
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::ComputeSpec;
|
||||
|
||||
#[test]
|
||||
fn params_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
@@ -23,7 +22,7 @@ mod pg_helpers_tests {
|
||||
|
||||
#[test]
|
||||
fn settings_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
|
||||
14
libs/compute_api/Cargo.toml
Normal file
14
libs/compute_api/Cargo.toml
Normal file
@@ -0,0 +1,14 @@
[package]
name = "compute_api"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
chrono.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true

workspace_hack.workspace = true
3
libs/compute_api/src/lib.rs
Normal file
3
libs/compute_api/src/lib.rs
Normal file
@@ -0,0 +1,3 @@
pub mod requests;
pub mod responses;
pub mod spec;
14
libs/compute_api/src/requests.rs
Normal file
14
libs/compute_api/src/requests.rs
Normal file
@@ -0,0 +1,14 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.

use crate::spec::ComputeSpec;
use serde::Deserialize;

/// Request of the /configure API.
///
/// We now pass only `spec` in the configuration request, but later we can
/// extend it with something like `restart: bool`. So put
/// `spec` into a struct initially to be more flexible in the future.
#[derive(Deserialize, Debug)]
pub struct ConfigurationRequest {
    pub spec: ComputeSpec,
}
66
libs/compute_api/src/responses.rs
Normal file
66
libs/compute_api/src/responses.rs
Normal file
@@ -0,0 +1,66 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.

use chrono::{DateTime, Utc};
use serde::{Serialize, Serializer};

#[derive(Serialize, Debug)]
pub struct GenericAPIError {
    pub error: String,
}

/// Response of the /status API
#[derive(Serialize, Debug)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
    pub tenant: String,
    pub timeline: String,
    pub status: ComputeStatus,
    #[serde(serialize_with = "rfc3339_serialize")]
    pub last_active: DateTime<Utc>,
    pub error: Option<String>,
}

#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
    pub status: ComputeStatus,
    /// Timestamp of the last Postgres activity
    #[serde(serialize_with = "rfc3339_serialize")]
    pub last_active: DateTime<Utc>,
    pub error: Option<String>,
}

#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
    // Spec wasn't provided at start, waiting for it to be
    // provided by control-plane.
    Empty,
    // Compute configuration was requested.
    ConfigurationPending,
    // Compute node has spec and initial startup and
    // configuration is in progress.
    Init,
    // Compute is configured and running.
    Running,
    // Either startup or configuration failed,
    // compute will exit soon or is waiting for
    // control-plane to terminate it.
    Failed,
}

fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    x.to_rfc3339().serialize(s)
}

/// Response of the /metrics.json API
#[derive(Clone, Debug, Default, Serialize)]
pub struct ComputeMetrics {
    pub sync_safekeepers_ms: u64,
    pub basebackup_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
}
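// Illustrative check, not part of the diff: because of `rename_all = "snake_case"`
// above, the /status and /configure responses spell enum variants in snake_case.
#[cfg(test)]
mod naming_tests {
    use super::ComputeStatus;

    #[test]
    fn compute_status_serializes_as_snake_case() {
        let s = serde_json::to_string(&ComputeStatus::ConfigurationPending).unwrap();
        assert_eq!(s, "\"configuration_pending\"");
    }
}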
|
||||
94
libs/compute_api/src/spec.rs
Normal file
94
libs/compute_api/src/spec.rs
Normal file
@@ -0,0 +1,94 @@
//! `ComputeSpec` represents the contents of the spec.json file.
//!
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
//! all the information needed to start up the right version of PostgreSQL,
//! and connect it to the storage nodes.
use serde::Deserialize;
use std::collections::HashMap;

/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;

/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize)]
pub struct ComputeSpec {
    pub format_version: f32,
    pub timestamp: String,
    pub operation_uuid: Option<String>,
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

    pub storage_auth_token: Option<String>,

    pub startup_tracing_context: Option<HashMap<String, String>>,
}

#[derive(Clone, Debug, Default, Deserialize)]
pub struct Cluster {
    pub cluster_id: String,
    pub name: String,
    pub state: Option<String>,
    pub roles: Vec<Role>,
    pub databases: Vec<Database>,
    pub settings: GenericOptions,
}

/// Single cluster state changing operation that could not be represented as
/// a static `Cluster` structure. For example:
/// - DROP DATABASE
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Debug, Deserialize)]
pub struct DeltaOp {
    pub action: String,
    pub name: PgIdent,
    pub new_name: Option<PgIdent>,
}

/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize)]
pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
    pub options: GenericOptions,
}

/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize)]
pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
    pub options: GenericOptions,
}

/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Debug, Deserialize)]
pub struct GenericOption {
    pub name: String,
    pub value: Option<String>,
    pub vartype: String,
}

/// Optional collection of `GenericOption`'s. Type alias allows us to
/// declare a `trait` on it.
pub type GenericOptions = Option<Vec<GenericOption>>;

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;

    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }
}
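// Illustrative sketch, not part of the diff: a minimal, hand-written spec that
// `ComputeSpec` would accept. All values are made up; only the field names come
// from the structs above.
#[cfg(test)]
mod minimal_spec_example {
    use super::*;

    #[test]
    fn parse_minimal_spec() {
        let json = r#"{
            "format_version": 1.0,
            "timestamp": "2023-04-11T00:00:00Z",
            "operation_uuid": null,
            "cluster": {
                "cluster_id": "example-cluster",
                "name": "example",
                "state": null,
                "roles": [],
                "databases": [],
                "settings": null
            },
            "delta_operations": null,
            "storage_auth_token": null,
            "startup_tracing_context": null
        }"#;
        let spec: ComputeSpec = serde_json::from_str(json).unwrap();
        assert_eq!(spec.cluster.cluster_id, "example-cluster");
    }
}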
|
||||
@@ -54,7 +54,7 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
|
||||
use io::ErrorKind::*;
|
||||
matches!(
|
||||
e.kind(),
|
||||
ConnectionRefused | ConnectionAborted | ConnectionReset
|
||||
ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut
|
||||
)
|
||||
}
|
||||
|
||||
@@ -320,9 +320,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
if let ProtoState::Closed = self.state {
|
||||
Ok(None)
|
||||
} else {
|
||||
let m = self.framed.read_message().await?;
|
||||
trace!("read msg {:?}", m);
|
||||
Ok(m)
|
||||
match self.framed.read_message().await {
|
||||
Ok(m) => {
|
||||
trace!("read msg {:?}", m);
|
||||
Ok(m)
|
||||
}
|
||||
Err(e) => {
|
||||
// remember not to try to read anymore
|
||||
self.state = ProtoState::Closed;
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -493,7 +501,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
MaybeWriteOnly::Full(framed) => {
|
||||
let (reader, writer) = framed.split();
|
||||
self.framed = MaybeWriteOnly::WriteOnly(writer);
|
||||
Ok(PostgresBackendReader(reader))
|
||||
Ok(PostgresBackendReader {
|
||||
reader,
|
||||
closed: false,
|
||||
})
|
||||
}
|
||||
MaybeWriteOnly::WriteOnly(_) => {
|
||||
anyhow::bail!("PostgresBackend is already split")
|
||||
@@ -510,8 +521,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
anyhow::bail!("PostgresBackend is not split")
|
||||
}
|
||||
MaybeWriteOnly::WriteOnly(writer) => {
|
||||
let joined = Framed::unsplit(reader.0, writer);
|
||||
let joined = Framed::unsplit(reader.reader, writer);
|
||||
self.framed = MaybeWriteOnly::Full(joined);
|
||||
// if reader encountered connection error, do not attempt reading anymore
|
||||
if reader.closed {
|
||||
self.state = ProtoState::Closed;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"),
|
||||
@@ -797,15 +812,25 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackendReader<IO>(FramedReader<MaybeTlsStream<IO>>);
|
||||
pub struct PostgresBackendReader<IO> {
|
||||
reader: FramedReader<MaybeTlsStream<IO>>,
|
||||
closed: bool, // true if received error closing the connection
|
||||
}
|
||||
|
||||
impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
|
||||
/// Read full message or return None if connection is cleanly closed with no
|
||||
/// unprocessed data.
|
||||
pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
|
||||
let m = self.0.read_message().await?;
|
||||
trace!("read msg {:?}", m);
|
||||
Ok(m)
|
||||
match self.reader.read_message().await {
|
||||
Ok(m) => {
|
||||
trace!("read msg {:?}", m);
|
||||
Ok(m)
|
||||
}
|
||||
Err(e) => {
|
||||
self.closed = true;
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get CopyData contents of the next message in COPY stream or error
|
||||
@@ -923,7 +948,7 @@ pub enum CopyStreamHandlerEnd {
|
||||
#[error("EOF on COPY stream")]
|
||||
EOF,
|
||||
/// The connection was lost
|
||||
#[error(transparent)]
|
||||
#[error("connection error: {0}")]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
|
||||
@@ -293,6 +293,9 @@ impl FeStartupPacket {
|
||||
// We shouldn't advance `buf` as probably full message is not there yet,
|
||||
// so can't directly use Bytes::get_u32 etc.
|
||||
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
|
||||
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
|
||||
// which is less readable
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"invalid startup packet message length {}",
|
||||
|
||||
@@ -13,7 +13,6 @@ use std::{
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
ops::Deref,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
@@ -90,7 +89,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
@@ -161,14 +160,67 @@ pub enum GenericRemoteStorage {
|
||||
Unreliable(Arc<UnreliableWrapper>),
|
||||
}
|
||||
|
||||
impl Deref for GenericRemoteStorage {
|
||||
type Target = dyn RemoteStorage;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
impl GenericRemoteStorage {
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
GenericRemoteStorage::LocalFs(local_fs) => local_fs,
|
||||
GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
|
||||
GenericRemoteStorage::Unreliable(s) => s.as_ref(),
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.download(from).await,
|
||||
Self::AwsS3(s) => s.download(from).await,
|
||||
Self::Unreliable(s) => s.download(from).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete(path).await,
|
||||
Self::AwsS3(s) => s.delete(path).await,
|
||||
Self::Unreliable(s) => s.delete(path).await,
|
||||
}
|
||||
}
|
||||
}
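// Note on the change above (illustrative sketch, names not from the diff): once
// `upload` takes `impl io::AsyncRead` instead of a boxed trait object, the trait
// has a generic method and is no longer object-safe, so the old
// `Deref<Target = dyn RemoteStorage>` shortcut cannot work; the enum now dispatches
// to each backend by hand. A stripped-down version of the same pattern:
trait Store {
    // The generic parameter here is what rules out `dyn Store`.
    fn put(&self, data: impl AsRef<[u8]>);
}

struct Local;
struct Remote;

impl Store for Local {
    fn put(&self, _data: impl AsRef<[u8]>) { /* write to local disk */ }
}
impl Store for Remote {
    fn put(&self, _data: impl AsRef<[u8]>) { /* issue a PUT request */ }
}

enum AnyStore {
    Local(Local),
    Remote(Remote),
}

impl AnyStore {
    // Hand-written dispatch, mirroring GenericRemoteStorage::upload above.
    fn put(&self, data: impl AsRef<[u8]>) {
        match self {
            Self::Local(s) => s.put(data),
            Self::Remote(s) => s.put(data),
        }
    }
}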
|
||||
@@ -199,7 +251,7 @@ impl GenericRemoteStorage {
|
||||
/// this path is used for the remote object id conversion only.
|
||||
pub async fn upload_storage_object(
|
||||
&self,
|
||||
from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
|
||||
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
@@ -118,7 +118,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
|
||||
@@ -343,7 +343,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
|
||||
@@ -84,7 +84,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
data_size_bytes: usize,
|
||||
|
||||
@@ -11,6 +11,14 @@ use serde::{Deserialize, Serialize};
|
||||
pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
|
||||
|
||||
impl Percent {
    pub const fn new(pct: u8) -> Option<Self> {
        if pct <= 100 {
            Some(Percent(pct))
        } else {
            None
        }
    }

    pub fn get(&self) -> u8 {
        self.0
    }
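// Illustrative usage, not part of the diff: `new` rejects values above 100, and
// the `>` -> `>=` change in the next hunk means usage at exactly `max_usage_pct`
// now counts as pressure.
#[test]
fn percent_bounds_example() {
    let pct = Percent::new(85).expect("85 is within 0..=100");
    assert_eq!(pct.get(), 85);
    assert!(Percent::new(101).is_none());

    let usage_pct: u64 = 85;
    assert!(usage_pct >= pct.get() as u64);
}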
|
||||
|
||||
@@ -639,7 +639,7 @@ mod filesystem_level_usage {
|
||||
),
|
||||
(
|
||||
"max_usage_pct",
|
||||
usage_pct > self.config.max_usage_pct.get() as u64,
|
||||
usage_pct >= self.config.max_usage_pct.get() as u64,
|
||||
),
|
||||
];
|
||||
|
||||
@@ -686,4 +686,43 @@ mod filesystem_level_usage {
|
||||
avail_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn max_usage_pct_pressure() {
|
||||
use super::Usage as _;
|
||||
use std::time::Duration;
|
||||
use utils::serde_percent::Percent;
|
||||
|
||||
let mut usage = Usage {
|
||||
config: &DiskUsageEvictionTaskConfig {
|
||||
max_usage_pct: Percent::new(85).unwrap(),
|
||||
min_avail_bytes: 0,
|
||||
period: Duration::MAX,
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
};
|
||||
|
||||
assert!(usage.has_pressure(), "expected pressure at 100%");
|
||||
|
||||
usage.add_available_bytes(14_000);
|
||||
assert!(usage.has_pressure(), "expected pressure at 86%");
|
||||
|
||||
usage.add_available_bytes(999);
|
||||
assert!(usage.has_pressure(), "expected pressure at 85.001%");
|
||||
|
||||
usage.add_available_bytes(1);
|
||||
assert!(usage.has_pressure(), "expected pressure at precisely 85%");
|
||||
|
||||
usage.add_available_bytes(1);
|
||||
assert!(!usage.has_pressure(), "no pressure at 84.999%");
|
||||
|
||||
usage.add_available_bytes(999);
|
||||
assert!(!usage.has_pressure(), "no pressure at 84%");
|
||||
|
||||
usage.add_available_bytes(16_000);
|
||||
assert!(!usage.has_pressure());
|
||||
}
|
||||
}
|
||||
|
||||
38
poetry.lock
generated
38
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
@@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "allure-pytest"
|
||||
version = "2.10.0"
|
||||
version = "2.13.1"
|
||||
description = "Allure pytest integration"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
|
||||
{file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
|
||||
{file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
|
||||
{file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
allure-python-commons = "2.10.0"
|
||||
allure-python-commons = "2.13.1"
|
||||
pytest = ">=4.5.0"
|
||||
six = ">=1.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "allure-python-commons"
|
||||
version = "2.10.0"
|
||||
version = "2.13.1"
|
||||
description = "Common module for integrate allure with python-based frameworks"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
|
||||
{file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
|
||||
{file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
|
||||
{file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
attrs = ">=16.0.0"
|
||||
pluggy = ">=0.4.0"
|
||||
six = ">=1.9.0"
|
||||
|
||||
[[package]]
|
||||
name = "async-timeout"
|
||||
@@ -1932,6 +1930,22 @@ pytest = [
|
||||
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-rerunfailures"
|
||||
version = "11.1.2"
|
||||
description = "pytest plugin to re-run tests to eliminate flaky failures"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
|
||||
{file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
packaging = ">=17.1"
|
||||
pytest = ">=5.3"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-timeout"
|
||||
version = "2.1.0"
|
||||
@@ -2597,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"
|
||||
content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
|
||||
|
||||
@@ -53,7 +53,7 @@ pub async fn password_hack(
|
||||
.await?;
|
||||
|
||||
info!(project = &payload.project, "received missing parameter");
|
||||
creds.project = Some(payload.project.into());
|
||||
creds.project = Some(payload.project);
|
||||
|
||||
let mut node = api.wake_compute(extra, creds).await?;
|
||||
node.config.password(payload.password);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
use crate::error::UserFacingError;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use thiserror::Error;
|
||||
use tracing::info;
|
||||
|
||||
@@ -19,11 +19,10 @@ pub enum ClientCredsParseError {
|
||||
InconsistentProjectNames { domain: String, option: String },
|
||||
|
||||
#[error(
|
||||
"SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
|
||||
SNI should be formatted as '<project-name>.{}'.",
|
||||
.sni, .cn, .cn,
|
||||
"Common name inferred from SNI ('{}') is not known",
|
||||
.cn,
|
||||
)]
|
||||
InconsistentSni { sni: String, cn: String },
|
||||
UnknownCommonName { cn: String },
|
||||
|
||||
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
|
||||
MalformedProjectName(String),
|
||||
@@ -37,7 +36,7 @@ impl UserFacingError for ClientCredsParseError {}
|
||||
pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
// TODO: this is a severe misnomer! We should think of a new name ASAP.
|
||||
pub project: Option<Cow<'a, str>>,
|
||||
pub project: Option<String>,
|
||||
}
|
||||
|
||||
impl ClientCredentials<'_> {
|
||||
@@ -51,7 +50,7 @@ impl<'a> ClientCredentials<'a> {
|
||||
pub fn parse(
|
||||
params: &'a StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_name: Option<&str>,
|
||||
common_names: Option<HashSet<String>>,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
@@ -60,37 +59,43 @@ impl<'a> ClientCredentials<'a> {
|
||||
let user = get_param("user")?;
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let project_option = params.options_raw().and_then(|mut options| {
|
||||
options
|
||||
.find_map(|opt| opt.strip_prefix("project="))
|
||||
.map(Cow::Borrowed)
|
||||
});
|
||||
let project_option = params
|
||||
.options_raw()
|
||||
.and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
|
||||
.map(|name| name.to_string());
|
||||
|
||||
// Alternative project name is in fact a subdomain from SNI.
|
||||
// NOTE: we do not consider SNI if `common_name` is missing.
|
||||
let project_domain = sni
|
||||
.zip(common_name)
|
||||
.map(|(sni, cn)| {
|
||||
subdomain_from_sni(sni, cn)
|
||||
.ok_or_else(|| InconsistentSni {
|
||||
sni: sni.into(),
|
||||
cn: cn.into(),
|
||||
let project_from_domain = if let Some(sni_str) = sni {
|
||||
if let Some(cn) = common_names {
|
||||
let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain);
|
||||
|
||||
let project = common_name_from_sni
|
||||
.and_then(|domain| {
|
||||
if cn.contains(domain) {
|
||||
subdomain_from_sni(sni_str, domain)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(Cow::<'static, str>::Owned)
|
||||
})
|
||||
.transpose()?;
|
||||
.ok_or_else(|| UnknownCommonName {
|
||||
cn: common_name_from_sni.unwrap_or("").into(),
|
||||
})?;
|
||||
|
||||
let project = match (project_option, project_domain) {
|
||||
Some(project)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let project = match (project_option, project_from_domain) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
(Some(option), Some(domain)) if option != domain => {
|
||||
Some(Err(InconsistentProjectNames {
|
||||
domain: domain.into(),
|
||||
option: option.into(),
|
||||
}))
|
||||
Some(Err(InconsistentProjectNames { domain, option }))
|
||||
}
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
|
||||
false => Err(MalformedProjectName(name.into())),
|
||||
false => Err(MalformedProjectName(name)),
|
||||
true => Ok(name),
|
||||
}),
|
||||
}
|
||||
@@ -149,9 +154,9 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("foo.localhost");
|
||||
let common_name = Some("localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
|
||||
@@ -177,24 +182,41 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
|
||||
|
||||
let sni = Some("baz.localhost");
|
||||
let common_name = Some("localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_multi_common_names() -> anyhow::Result<()> {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.a.com");
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_projects_different() {
|
||||
let options =
|
||||
StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
|
||||
|
||||
let sni = Some("second.localhost");
|
||||
let common_name = Some("localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentProjectNames { domain, option } => {
|
||||
assert_eq!(option, "first");
|
||||
@@ -209,13 +231,12 @@ mod tests {
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_name = Some("example.com");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
|
||||
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
|
||||
match err {
|
||||
InconsistentSni { sni, cn } => {
|
||||
assert_eq!(sni, "project.localhost");
|
||||
assert_eq!(cn, "example.com");
|
||||
UnknownCommonName { cn } => {
|
||||
assert_eq!(cn, "localhost");
|
||||
}
|
||||
_ => panic!("bad error: {err:?}"),
|
||||
}
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
use crate::auth;
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use std::{str::FromStr, sync::Arc, time::Duration};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use rustls::sign;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
@@ -16,7 +22,7 @@ pub struct MetricCollectionConfig {
|
||||
|
||||
pub struct TlsConfig {
|
||||
pub config: Arc<rustls::ServerConfig>,
|
||||
pub common_name: Option<String>,
|
||||
pub common_names: Option<HashSet<String>>,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
@@ -26,28 +32,34 @@ impl TlsConfig {
|
||||
}
|
||||
|
||||
/// Configure TLS for the main endpoint.
|
||||
pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
pub fn configure_tls(
|
||||
key_path: &str,
|
||||
cert_path: &str,
|
||||
certs_dir: Option<&String>,
|
||||
) -> anyhow::Result<TlsConfig> {
|
||||
let mut cert_resolver = CertResolver::new();
|
||||
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().map(rustls::PrivateKey).unwrap()
|
||||
};
|
||||
// add default certificate
|
||||
cert_resolver.add_cert(key_path, cert_path)?;
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
// add extra certificates
|
||||
if let Some(certs_dir) = certs_dir {
|
||||
for entry in std::fs::read_dir(certs_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
// file names aligned with default cert-manager names
|
||||
let key_path = path.join("tls.key");
|
||||
let cert_path = path.join("tls.crt");
|
||||
if key_path.exists() && cert_path.exists() {
|
||||
cert_resolver
|
||||
.add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.context(format!(
|
||||
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.into_iter()
|
||||
.map(rustls::Certificate)
|
||||
.collect()
|
||||
};
|
||||
let common_names = cert_resolver.get_common_names();
|
||||
|
||||
let config = rustls::ServerConfig::builder()
|
||||
.with_safe_default_cipher_suites()
|
||||
@@ -55,27 +67,116 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
|
||||
// allow TLS 1.2 to be compatible with older client libraries
|
||||
.with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(cert_chain, key)?
|
||||
.with_cert_resolver(Arc::new(cert_resolver))
|
||||
.into();
|
||||
|
||||
// determine common name from tls-cert (-c server.crt param).
|
||||
// used in asserting project name formatting invariant.
|
||||
let common_name = {
|
||||
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
|
||||
.context(format!(
|
||||
"Failed to parse PEM object from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.1;
|
||||
let common_name = pem.parse_x509()?.subject().to_string();
|
||||
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
|
||||
};
|
||||
|
||||
Ok(TlsConfig {
|
||||
config,
|
||||
common_name,
|
||||
common_names: Some(common_names),
|
||||
})
|
||||
}
|
||||
|
||||
struct CertResolver {
|
||||
certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
|
||||
}
|
||||
|
||||
impl CertResolver {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
certs: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
|
||||
let priv_key = {
|
||||
let key_bytes = std::fs::read(key_path).context("TLS key file")?;
|
||||
let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.context(format!("Failed to read TLS keys at '{key_path}'"))?;
|
||||
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().map(rustls::PrivateKey).unwrap()
|
||||
};
|
||||
|
||||
let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
|
||||
|
||||
let cert_chain_bytes = std::fs::read(cert_path)
|
||||
.context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
|
||||
|
||||
let cert_chain = {
|
||||
rustls_pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.context(format!(
|
||||
"Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.into_iter()
|
||||
.map(rustls::Certificate)
|
||||
.collect()
|
||||
};
|
||||
|
||||
let common_name = {
|
||||
let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
|
||||
.context(format!(
|
||||
"Failed to parse PEM object from bytes from file at '{cert_path}'."
|
||||
))?
|
||||
.1;
|
||||
let common_name = pem.parse_x509()?.subject().to_string();
|
||||
|
||||
// We only use non-wildcard certificates in the link proxy, so it seems okay to treat them the same as
// wildcard ones, since we don't use SNI there. That treatment only affects certificate selection, so
// verify-full will still check the wildcard match. The old code simply ignored non-wildcard common names
// and passed None instead, which blew up the number of cases downstream code had to handle. A proper fix
// would avoid Option for common_names entirely and do wildcard-based certificate selection instead of
// cutting off the '*.' prefix.
|
||||
if common_name.starts_with("CN=*.") {
|
||||
common_name.strip_prefix("CN=*.").map(|s| s.to_string())
|
||||
} else {
|
||||
common_name.strip_prefix("CN=").map(|s| s.to_string())
|
||||
}
|
||||
}
|
||||
.context(format!(
|
||||
"Failed to parse common name from certificate at '{cert_path}'."
|
||||
))?;
|
||||
|
||||
self.certs.insert(
|
||||
common_name,
|
||||
Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
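As a side note, the common-name normalization above ("CN=*.a.com" and "CN=a.com" both mapping to "a.com") can be illustrated with a minimal, self-contained sketch; the helper name and the sample subjects below are illustrative only and not part of this change:

fn normalize_common_name(subject: &str) -> Option<String> {
    // Wildcard and non-wildcard subjects are treated the same: both yield the bare domain.
    if let Some(rest) = subject.strip_prefix("CN=*.") {
        Some(rest.to_string())
    } else {
        subject.strip_prefix("CN=").map(|s| s.to_string())
    }
}

fn main() {
    assert_eq!(normalize_common_name("CN=*.a.com").as_deref(), Some("a.com"));
    assert_eq!(normalize_common_name("CN=a.com").as_deref(), Some("a.com"));
    assert_eq!(normalize_common_name("O=acme"), None);
}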
|
||||
|
||||
fn get_common_names(&self) -> HashSet<String> {
|
||||
self.certs.keys().map(|s| s.to_string()).collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl rustls::server::ResolvesServerCert for CertResolver {
|
||||
fn resolve(
|
||||
&self,
|
||||
_client_hello: rustls::server::ClientHello,
|
||||
) -> Option<Arc<rustls::sign::CertifiedKey>> {
|
||||
// Loop here, cutting off more and more subdomains until we find
// a match, to get proper wildcard support. OTOH, we don't use
// nested domains right now, so keep this simple for now.
//
// With the current code foo.com will match *.foo.com, which
// repeats the behavior of the old code.
|
||||
if let Some(mut sni_name) = _client_hello.server_name() {
|
||||
loop {
|
||||
if let Some(cert) = self.certs.get(sni_name) {
|
||||
return Some(cert.clone());
|
||||
}
|
||||
if let Some((_, rest)) = sni_name.split_once('.') {
|
||||
sni_name = rest;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
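The subdomain-stripping lookup in `resolve` can be exercised in isolation. The sketch below models it with plain strings instead of `CertifiedKey` and hypothetical domain names, assuming the same split-on-first-dot fallback:

use std::collections::HashMap;

fn lookup<'a>(certs: &HashMap<String, &'a str>, sni: &str) -> Option<&'a str> {
    let mut name = sni;
    loop {
        // Exact match first, then cut off the leftmost label and retry,
        // so "p1.a.com" falls back to the entry stored under "a.com".
        if let Some(cert) = certs.get(name).copied() {
            return Some(cert);
        }
        match name.split_once('.') {
            Some((_, rest)) => name = rest,
            None => return None,
        }
    }
}

fn main() {
    let mut certs = HashMap::new();
    certs.insert("a.com".to_string(), "key for *.a.com");
    assert_eq!(lookup(&certs, "p1.a.com"), Some("key for *.a.com"));
    assert_eq!(lookup(&certs, "a.com"), Some("key for *.a.com"));
    assert_eq!(lookup(&certs, "b.org"), None);
}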
|
||||
|
||||
/// Helper for cmdline cache options parsing.
|
||||
pub struct CacheOptions {
|
||||
/// Max number of entries.
|
||||
|
||||
@@ -132,7 +132,11 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
|
||||
args.get_one::<String>("tls-key"),
|
||||
args.get_one::<String>("tls-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
|
||||
(Some(key_path), Some(cert_path)) => Some(config::configure_tls(
|
||||
key_path,
|
||||
cert_path,
|
||||
args.get_one::<String>("certs-dir"),
|
||||
)?),
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither tls-key and tls-cert must be specified"),
|
||||
};
|
||||
@@ -254,6 +258,12 @@ fn cli() -> clap::Command {
|
||||
.alias("ssl-cert") // backwards compatibility
|
||||
.help("path to TLS cert for client postgres connections"),
|
||||
)
|
||||
// tls-key and tls-cert are kept for backwards compatibility; all certs can also be put in one dir
|
||||
.arg(
|
||||
Arg::new("certs-dir")
|
||||
.long("certs-dir")
|
||||
.help("path to directory with TLS certificates for client postgres connections"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("metric-collection-endpoint")
|
||||
.long("metric-collection-endpoint")
|
||||
|
||||
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use tracing::{debug, error, info, instrument, trace};
|
||||
use tracing::{error, info, instrument, trace, warn};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
@@ -84,10 +84,14 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
|
||||
let value = ms.get_counter().get_value() as u64;
|
||||
|
||||
debug!(
|
||||
"branch_id {} endpoint_id {} val: {}",
|
||||
branch_id, endpoint_id, value
|
||||
);
|
||||
// Report if the metric value is suspiciously large
|
||||
if value > (1u64 << 40) {
|
||||
warn!(
|
||||
"potentially abnormal counter value: branch_id {} endpoint_id {} val: {}",
|
||||
branch_id, endpoint_id, value
|
||||
);
|
||||
}
|
||||
|
||||
current_metrics.push((
|
||||
Ids {
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
@@ -124,11 +128,15 @@ async fn collect_metrics_iteration(
|
||||
let mut value = *curr_val;
|
||||
|
||||
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
|
||||
// Only send metrics updates if the metric has changed
|
||||
if curr_val - prev_val > 0 {
|
||||
// Only send metrics updates if the metric has increased
|
||||
if curr_val > prev_val {
|
||||
value = curr_val - prev_val;
|
||||
start_time = *prev_time;
|
||||
} else {
|
||||
if curr_val < prev_val {
|
||||
error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}",
|
||||
prev_val, curr_val, curr_key);
|
||||
}
|
||||
return None;
|
||||
}
|
||||
};
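A self-contained sketch of the delta rule above, assuming the same semantics (the first observation reports the full value, an increase reports only the difference, a decrease is logged and skipped); the function name is hypothetical:

fn counter_delta(curr: u64, prev: Option<u64>) -> Option<u64> {
    match prev {
        // First observation: report the full value.
        None => Some(curr),
        // Counter increased: report only the increase since the cached value.
        Some(p) if curr > p => Some(curr - p),
        // Counter decreased: log the anomaly and send nothing.
        Some(p) if curr < p => {
            eprintln!("counter value decreased from {p} to {curr}");
            None
        }
        // Unchanged: nothing to send.
        _ => None,
    }
}

fn main() {
    assert_eq!(counter_delta(100, None), Some(100));
    assert_eq!(counter_delta(150, Some(100)), Some(50));
    assert_eq!(counter_delta(150, Some(150)), None);
    assert_eq!(counter_delta(90, Some(150)), None);
}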
|
||||
@@ -189,7 +197,7 @@ async fn collect_metrics_iteration(
|
||||
})
|
||||
// update cached value (add delta) and time
|
||||
.and_modify(|e| {
|
||||
e.0 += send_metric.value;
|
||||
e.0 = e.0.saturating_add(send_metric.value);
|
||||
e.1 = stop_time
|
||||
})
|
||||
// cache new metric
|
||||
|
||||
@@ -124,11 +124,11 @@ pub async fn handle_ws_client(
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
@@ -163,11 +163,11 @@ async fn handle_client(
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let sni = stream.get_ref().sni_hostname();
|
||||
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_names))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
|
||||
@@ -54,9 +54,11 @@ fn generate_tls_config<'a>(
|
||||
.with_single_cert(vec![cert], key)?
|
||||
.into();
|
||||
|
||||
let common_names = Some([common_name.to_owned()].iter().cloned().collect());
|
||||
|
||||
TlsConfig {
|
||||
config,
|
||||
common_name: Some(common_name.to_string()),
|
||||
common_names,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^2.2.3"
|
||||
pytest-order = "^1.0.1"
|
||||
allure-pytest = "^2.10.0"
|
||||
allure-pytest = "^2.13.1"
|
||||
pytest-asyncio = "^0.19.0"
|
||||
toml = "^0.10.2"
|
||||
psutil = "^5.9.4"
|
||||
@@ -34,6 +34,7 @@ types-psutil = "^5.9.5.4"
|
||||
types-toml = "^0.10.8"
|
||||
pytest-httpserver = "^1.0.6"
|
||||
aiohttp = "3.7.4"
|
||||
pytest-rerunfailures = "^11.1.2"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^23.1.0"
|
||||
@@ -69,6 +70,9 @@ strict = true
|
||||
module = [
|
||||
"asyncpg.*",
|
||||
"pg8000.*",
|
||||
"allure.*",
|
||||
"allure_commons.*",
|
||||
"allure_pytest.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
@@ -8,13 +8,7 @@
|
||||
# warnings and errors right in the editor.
|
||||
# In vscode, this setting is Rust-analyzer>Check On Save:Command
|
||||
|
||||
# manual-range-contains wants
|
||||
# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)
|
||||
# instead of
|
||||
# len < 4 || len > MAX_STARTUP_PACKET_LENGTH
|
||||
# , let's disagree.
|
||||
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
|
||||
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings
|
||||
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
|
||||
|
||||
@@ -30,6 +30,7 @@ serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -674,7 +674,8 @@ impl Timeline {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
self.write_shared_state().sk.inmem.backup_lsn = backup_lsn;
|
||||
let mut state = self.write_shared_state();
|
||||
state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
|
||||
// we should check whether to shut down offloader, but this will be done
|
||||
// soon by peer communication anyway.
|
||||
Ok(())
|
||||
|
||||
@@ -323,7 +323,8 @@ impl WalBackupTask {
|
||||
}
|
||||
|
||||
match backup_lsn_range(
|
||||
backup_lsn,
|
||||
&self.timeline,
|
||||
&mut backup_lsn,
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
&self.timeline_dir,
|
||||
@@ -331,13 +332,7 @@ impl WalBackupTask {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(backup_lsn_result) => {
|
||||
backup_lsn = backup_lsn_result;
|
||||
let res = self.timeline.set_wal_backup_lsn(backup_lsn_result);
|
||||
if let Err(e) = res {
|
||||
error!("failed to set wal_backup_lsn: {}", e);
|
||||
return;
|
||||
}
|
||||
Ok(()) => {
|
||||
retry_attempt = 0;
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -354,20 +349,25 @@ impl WalBackupTask {
|
||||
}
|
||||
|
||||
pub async fn backup_lsn_range(
|
||||
start_lsn: Lsn,
|
||||
timeline: &Arc<Timeline>,
|
||||
backup_lsn: &mut Lsn,
|
||||
end_lsn: Lsn,
|
||||
wal_seg_size: usize,
|
||||
timeline_dir: &Path,
|
||||
workspace_dir: &Path,
|
||||
) -> Result<Lsn> {
|
||||
let mut res = start_lsn;
|
||||
) -> Result<()> {
|
||||
let start_lsn = *backup_lsn;
|
||||
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
|
||||
for s in &segments {
|
||||
backup_single_segment(s, timeline_dir, workspace_dir)
|
||||
.await
|
||||
.with_context(|| format!("offloading segno {}", s.seg_no))?;
|
||||
|
||||
res = s.end_lsn;
|
||||
let new_backup_lsn = s.end_lsn;
|
||||
timeline
|
||||
.set_wal_backup_lsn(new_backup_lsn)
|
||||
.context("setting wal_backup_lsn")?;
|
||||
*backup_lsn = new_backup_lsn;
|
||||
}
|
||||
info!(
|
||||
"offloaded segnos {:?} up to {}, previous backup_lsn {}",
|
||||
@@ -375,7 +375,7 @@ pub async fn backup_lsn_range(
|
||||
end_lsn,
|
||||
start_lsn,
|
||||
);
|
||||
Ok(res)
|
||||
Ok(())
|
||||
}
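The invariant behind this change (backup_lsn is advanced after every offloaded segment and never moves backwards, mirroring the max() in set_wal_backup_lsn) can be sketched in isolation; Lsn is modeled as a plain u64 here and the segment boundaries are made up:

use std::cmp::max;

struct SharedState {
    backup_lsn: u64,
}

impl SharedState {
    fn set_wal_backup_lsn(&mut self, lsn: u64) {
        // Never move backwards, mirroring max(state.sk.inmem.backup_lsn, backup_lsn).
        self.backup_lsn = max(self.backup_lsn, lsn);
    }
}

fn main() {
    let mut state = SharedState { backup_lsn: 100 };
    // Offload three segments; an out-of-order update must not regress the LSN.
    for seg_end_lsn in [120, 150, 140] {
        state.set_wal_backup_lsn(seg_end_lsn);
    }
    assert_eq!(state.backup_lsn, 150);
}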
|
||||
|
||||
async fn backup_single_segment(
|
||||
|
||||
@@ -4,8 +4,9 @@
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::QueryError;
|
||||
use std::{future, thread};
|
||||
use std::{future, thread, time::Duration};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_io_timeout::TimeoutReader;
|
||||
use tracing::*;
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
|
||||
@@ -67,41 +68,52 @@ fn handle_socket(
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let local = tokio::task::LocalSet::new();
|
||||
|
||||
socket.set_nodelay(true)?;
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
let traffic_metrics = TrafficMetrics::new();
|
||||
if let Some(current_az) = conf.availability_zone.as_deref() {
|
||||
traffic_metrics.set_sk_az(current_az);
|
||||
}
|
||||
// TimeoutReader wants async runtime during creation.
|
||||
runtime.block_on(async move {
|
||||
// Set a timeout on reading from the socket. It prevents a hung-up connection
// if the client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
// default, and tokio doesn't provide the ability to set it out of the box.
|
||||
let mut socket = TimeoutReader::new(socket);
|
||||
let wal_service_timeout = Duration::from_secs(60 * 10);
|
||||
socket.set_timeout(Some(wal_service_timeout));
|
||||
// pin! is here because TimeoutReader (due to storing a sleep future inside)
// is not Unpin, and all pgbackend/framed/tokio dependencies require the stream
// to be Unpin, which is reasonable, as indeed something like TimeoutReader
// shouldn't be moved.
|
||||
tokio::pin!(socket);
|
||||
|
||||
let socket = MeasuredStream::new(
|
||||
socket,
|
||||
|cnt| {
|
||||
traffic_metrics.observe_read(cnt);
|
||||
},
|
||||
|cnt| {
|
||||
traffic_metrics.observe_write(cnt);
|
||||
},
|
||||
);
|
||||
let traffic_metrics = TrafficMetrics::new();
|
||||
if let Some(current_az) = conf.availability_zone.as_deref() {
|
||||
traffic_metrics.set_sk_az(current_az);
|
||||
}
|
||||
|
||||
let auth_type = match conf.auth {
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
local.block_on(
|
||||
&runtime,
|
||||
pgbackend.run(&mut conn_handler, future::pending::<()>),
|
||||
)?;
|
||||
let socket = MeasuredStream::new(
|
||||
socket,
|
||||
|cnt| {
|
||||
traffic_metrics.observe_read(cnt);
|
||||
},
|
||||
|cnt| {
|
||||
traffic_metrics.observe_write(cnt);
|
||||
},
|
||||
);
|
||||
|
||||
Ok(())
|
||||
let auth_type = match conf.auth {
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
pgbackend
|
||||
.run(&mut conn_handler, future::pending::<()>)
|
||||
.await
|
||||
})
|
||||
}
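The TimeoutReader-plus-pin! pattern used above can be shown on an in-memory reader instead of a TcpStream; this is just a sketch with an illustrative timeout, assuming the same tokio-io-timeout API as in the diff:

use std::time::Duration;
use tokio::io::AsyncReadExt;
use tokio_io_timeout::TimeoutReader;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let data: &[u8] = b"hello";
    let mut reader = TimeoutReader::new(data);
    reader.set_timeout(Some(Duration::from_secs(600)));
    // TimeoutReader stores a sleep future inside and is therefore !Unpin,
    // so it has to be pinned before it can be used as an async stream.
    tokio::pin!(reader);

    let mut buf = Vec::new();
    reader.read_to_end(&mut buf).await?;
    assert_eq!(buf, b"hello");
    Ok(())
}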
|
||||
|
||||
/// Unique WAL service connection ids are logged in spans for observability.
|
||||
|
||||
87
scripts/flaky_tests.py
Executable file
87
scripts/flaky_tests.py
Executable file
@@ -0,0 +1,87 @@
|
||||
#! /usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from typing import DefaultDict, Dict
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
|
||||
FLAKY_TESTS_QUERY = """
|
||||
SELECT
|
||||
DISTINCT parent_suite, suite, test
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
revision,
|
||||
jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
|
||||
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
|
||||
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
|
||||
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
|
||||
FROM
|
||||
regress_test_results
|
||||
WHERE
|
||||
reference = 'refs/heads/main'
|
||||
) data
|
||||
WHERE
|
||||
timestamp > CURRENT_DATE - INTERVAL '%s' day
|
||||
AND status::text IN ('"failed"', '"broken"')
|
||||
;
|
||||
"""
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
connstr = args.connstr
|
||||
interval_days = args.days
|
||||
output = args.output
|
||||
|
||||
res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
|
||||
res = defaultdict(lambda: defaultdict(dict))
|
||||
|
||||
logging.info("connecting to the database...")
|
||||
with psycopg2.connect(connstr, connect_timeout=10) as conn:
|
||||
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
|
||||
logging.info("fetching flaky tests...")
|
||||
cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
|
||||
rows = cur.fetchall()
|
||||
|
||||
for row in rows:
|
||||
logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
|
||||
res[row["parent_suite"]][row["suite"]][row["test"]] = True
|
||||
|
||||
logging.info(f"saving results to {output.name}")
|
||||
json.dump(res, output, indent=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=argparse.FileType("w"),
|
||||
default="flaky.json",
|
||||
help="path to output json file (default: flaky.json)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--days",
|
||||
required=False,
|
||||
default=10,
|
||||
type=int,
|
||||
help="how many days to look back for flaky tests (default: 10)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"connstr",
|
||||
help="connection string to the test results database",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
level = logging.INFO
|
||||
logging.basicConfig(
|
||||
format="%(message)s",
|
||||
level=level,
|
||||
)
|
||||
|
||||
main(args)
|
||||
125
scripts/pr-comment-test-report.js
Normal file
125
scripts/pr-comment-test-report.js
Normal file
@@ -0,0 +1,125 @@
|
||||
//
|
||||
// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
|
||||
// It accepts an array of report items and creates a comment with a summary section for each one (for "release" and "debug" builds, together or separately if either of them failed to be generated).
|
||||
//
|
||||
// The comment is updated on each run with the latest results.
|
||||
//
|
||||
// It is designed to be used with actions/github-script from GitHub Workflows:
|
||||
// - uses: actions/github-script@v6
|
||||
// with:
|
||||
// script: |
|
||||
// const script = require("./scripts/pr-comment-test-report.js")
|
||||
// await script({
|
||||
// github,
|
||||
// context,
|
||||
// fetch,
|
||||
// reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
|
||||
// })
|
||||
//
|
||||
|
||||
module.exports = async ({ github, context, fetch, reports }) => {
|
||||
// Marker to find the comment in the subsequent runs
|
||||
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
|
||||
// GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
|
||||
const githubActionsBotId = 41898282
|
||||
// URL of the latest commit in the PR
|
||||
const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
|
||||
// Comment body itself
|
||||
let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`
|
||||
|
||||
// Common parameters for GitHub API requests
|
||||
const ownerRepoParams = {
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
}
|
||||
|
||||
for (const report of reports) {
|
||||
const {buildType, reportUrl, jsonUrl} = report
|
||||
|
||||
if (!reportUrl || !jsonUrl) {
|
||||
console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`)
|
||||
continue
|
||||
}
|
||||
|
||||
const suites = await (await fetch(jsonUrl)).json()
|
||||
|
||||
// Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
|
||||
// For this report it's ok to treat them in the same way (as failed).
|
||||
const failedTests = []
const passedTests = []
const skippedTests = []

const retriedTests = []
const retriedStatusChangedTests = []
|
||||
|
||||
for (const parentSuite of suites.children) {
|
||||
for (const suite of parentSuite.children) {
|
||||
for (const test of suite.children) {
|
||||
const pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}`
|
||||
test.pytestName = pytestName
|
||||
|
||||
if (test.status === "passed") {
|
||||
passedTests.push(test);
|
||||
} else if (test.status === "failed" || test.status === "broken") {
|
||||
failedTests.push(test);
|
||||
} else if (test.status === "skipped") {
|
||||
skippedTests.push(test);
|
||||
}
|
||||
|
||||
if (test.retriesCount > 0) {
|
||||
retriedTests.push(test);
|
||||
|
||||
if (test.retriedStatusChangedTests) {
|
||||
retriedStatusChangedTests.push(test);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
|
||||
commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
|
||||
if (failedTests.length > 0) {
|
||||
commentBody += `Failed tests:\n`
|
||||
for (const test of failedTests) {
|
||||
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
|
||||
|
||||
commentBody += `- [\`${test.pytestName}\`](${allureLink})`
|
||||
if (test.retriesCount > 0) {
|
||||
commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
|
||||
}
|
||||
commentBody += "\n"
|
||||
}
|
||||
commentBody += "\n"
|
||||
}
|
||||
if (retriedStatusChangedTests.length > 0) {
|
||||
commentBody += `Flaky tests:\n`
|
||||
for (const test of retriedStatusChangedTests) {
|
||||
const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
|
||||
commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
|
||||
}
|
||||
commentBody += "\n"
|
||||
}
|
||||
commentBody += "___\n"
|
||||
}
|
||||
|
||||
const { data: comments } = await github.rest.issues.listComments({
|
||||
issue_number: context.payload.number,
|
||||
...ownerRepoParams,
|
||||
})
|
||||
|
||||
const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
|
||||
if (comment) {
|
||||
await github.rest.issues.updateComment({
|
||||
comment_id: comment.id,
|
||||
body: commentBody,
|
||||
...ownerRepoParams,
|
||||
})
|
||||
} else {
|
||||
await github.rest.issues.createComment({
|
||||
issue_number: context.payload.number,
|
||||
body: commentBody,
|
||||
...ownerRepoParams,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -4,4 +4,5 @@ pytest_plugins = (
|
||||
"fixtures.pg_stats",
|
||||
"fixtures.compare_fixtures",
|
||||
"fixtures.slow",
|
||||
"fixtures.flaky",
|
||||
)
|
||||
|
||||
58
test_runner/fixtures/flaky.py
Normal file
58
test_runner/fixtures/flaky.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from _pytest.config import Config
|
||||
from _pytest.config.argparsing import Parser
|
||||
from allure_commons.types import LabelType
|
||||
from allure_pytest.utils import allure_name, allure_suite_labels
|
||||
|
||||
from fixtures.log_helper import log
|
||||
|
||||
"""
|
||||
The plugin reruns flaky tests.
It uses `pytest.mark.flaky` provided by the `pytest-rerunfailures` plugin and the list of flaky tests detected by `scripts/flaky_tests.py`.

Note: the logic for getting flaky tests is extracted into a separate script to avoid running it once per xdist worker.
"""
|
||||
|
||||
|
||||
def pytest_addoption(parser: Parser):
|
||||
parser.addoption(
|
||||
"--flaky-tests-json",
|
||||
action="store",
|
||||
type=Path,
|
||||
help="Path to json file with flaky tests generated by scripts/flaky_tests.py",
|
||||
)
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
|
||||
if not config.getoption("--flaky-tests-json"):
|
||||
return
|
||||
|
||||
# Any errors while getting the list of flaky tests aren't critical, so just don't rerun any tests
|
||||
flaky_json = config.getoption("--flaky-tests-json")
|
||||
if not flaky_json.exists():
|
||||
return
|
||||
|
||||
content = flaky_json.read_text()
|
||||
try:
|
||||
flaky_tests = json.loads(content)
|
||||
except ValueError:
|
||||
log.error(f"Can't parse {content} as json")
|
||||
return
|
||||
|
||||
for item in items:
|
||||
# Use the same logic for constructing test name as Allure does (we store allure-provided data in DB)
|
||||
# Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
|
||||
allure_labels = dict(allure_suite_labels(item))
|
||||
parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
|
||||
suite = str(allure_labels.get(LabelType.SUITE))
|
||||
params = item.callspec.params if hasattr(item, "callspec") else {}
|
||||
name = allure_name(item, params)
|
||||
|
||||
if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
|
||||
# Rerun 3 times = 1 original run + 2 reruns
|
||||
log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
|
||||
item.add_marker(pytest.mark.flaky(reruns=2))
|
||||
@@ -14,9 +14,9 @@ import tempfile
|
||||
import textwrap
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import Flag, auto
|
||||
from functools import cached_property
|
||||
from itertools import chain, product
|
||||
@@ -43,11 +43,12 @@ from psycopg2.extensions import make_dsn, parse_dsn
|
||||
from typing_extensions import Literal
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, parse_metrics
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import (
|
||||
ATTACHMENT_NAME_REGEX,
|
||||
Fn,
|
||||
allure_add_grafana_links,
|
||||
allure_attach_from_dir,
|
||||
get_self_dir,
|
||||
subprocess_capture,
|
||||
@@ -1118,538 +1119,6 @@ def neon_env_builder(
|
||||
yield builder
|
||||
|
||||
|
||||
class PageserverApiException(Exception):
|
||||
def __init__(self, message, status_code: int):
|
||||
super().__init__(message)
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
class PageserverHttpClient(requests.Session):
|
||||
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
|
||||
super().__init__()
|
||||
self.port = port
|
||||
self.auth_token = auth_token
|
||||
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
|
||||
|
||||
if auth_token is not None:
|
||||
self.headers["Authorization"] = f"Bearer {auth_token}"
|
||||
|
||||
def verbose_error(self, res: requests.Response):
|
||||
try:
|
||||
res.raise_for_status()
|
||||
except requests.RequestException as e:
|
||||
try:
|
||||
msg = res.json()["msg"]
|
||||
except: # noqa: E722
|
||||
msg = ""
|
||||
raise PageserverApiException(msg, res.status_code) from e
|
||||
|
||||
def check_status(self):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
pairs = config_strings
|
||||
|
||||
log.info(f"Requesting config failpoints: {repr(pairs)}")
|
||||
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/failpoints",
|
||||
json=[{"name": name, "actions": actions} for name, actions in pairs],
|
||||
)
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
return res_json
|
||||
|
||||
def tenant_list(self) -> List[Dict[Any, Any]]:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, list)
|
||||
return res_json
|
||||
|
||||
def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant",
|
||||
json={
|
||||
"new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
|
||||
},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
if res.status_code == 409:
|
||||
raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
|
||||
new_tenant_id = res.json()
|
||||
assert isinstance(new_tenant_id, str)
|
||||
return TenantId(new_tenant_id)
|
||||
|
||||
def tenant_attach(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
|
||||
params = {}
|
||||
if detach_ignored:
|
||||
params["detach_ignored"] = "true"
|
||||
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_ignore(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
|
||||
self.verbose_error(res)
|
||||
return TenantConfig.from_json(res.json())
|
||||
|
||||
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
|
||||
assert "tenant_id" not in config.keys()
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/config",
|
||||
json={**config, "tenant_id": str(tenant_id)},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def patch_tenant_config_client_side(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
inserts: Optional[Dict[str, Any]] = None,
|
||||
removes: Optional[List[str]] = None,
|
||||
):
|
||||
current = self.tenant_config(tenant_id).tenant_specific_overrides
|
||||
if inserts is not None:
|
||||
current.update(inserts)
|
||||
if removes is not None:
|
||||
for key in removes:
|
||||
del current[key]
|
||||
self.set_tenant_config(tenant_id, current)
|
||||
|
||||
def tenant_size(self, tenant_id: TenantId) -> int:
|
||||
return self.tenant_size_and_modelinputs(tenant_id)[0]
|
||||
|
||||
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
|
||||
"""
|
||||
Returns the tenant size, together with the model inputs as the second tuple item.
|
||||
"""
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
|
||||
self.verbose_error(res)
|
||||
res = res.json()
|
||||
assert isinstance(res, dict)
|
||||
assert TenantId(res["id"]) == tenant_id
|
||||
size = res["size"]
|
||||
assert type(size) == int
|
||||
inputs = res["inputs"]
|
||||
assert type(inputs) is dict
|
||||
return (size, inputs)
|
||||
|
||||
def tenant_size_debug(self, tenant_id: TenantId) -> str:
|
||||
"""
|
||||
Returns the tenant size debug info, as an HTML string
|
||||
"""
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
|
||||
headers={"Accept": "text/html"},
|
||||
)
|
||||
return res.text
|
||||
|
||||
def timeline_list(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
params = {}
|
||||
if include_non_incremental_logical_size:
|
||||
params["include-non-incremental-logical-size"] = "true"
|
||||
if include_timeline_dir_layer_file_size_sum:
|
||||
params["include-timeline-dir-layer-file-size-sum"] = "true"
|
||||
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, list)
|
||||
return res_json
|
||||
|
||||
def timeline_create(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
new_timeline_id: Optional[TimelineId] = None,
|
||||
ancestor_timeline_id: Optional[TimelineId] = None,
|
||||
ancestor_start_lsn: Optional[Lsn] = None,
|
||||
) -> Dict[Any, Any]:
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline",
|
||||
json={
|
||||
"new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
|
||||
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
|
||||
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
|
||||
},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
if res.status_code == 409:
|
||||
raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
|
||||
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_detail(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
include_non_incremental_logical_size: bool = False,
|
||||
include_timeline_dir_layer_file_size_sum: bool = False,
|
||||
**kwargs,
|
||||
) -> Dict[Any, Any]:
|
||||
params = {}
|
||||
if include_non_incremental_logical_size:
|
||||
params["include-non-incremental-logical-size"] = "true"
|
||||
if include_timeline_dir_layer_file_size_sum:
|
||||
params["include-timeline-dir-layer-file-size-sum"] = "true"
|
||||
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
params=params,
|
||||
**kwargs,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
res = self.delete(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
|
||||
def timeline_gc(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
|
||||
) -> dict[str, Any]:
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(
|
||||
f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
|
||||
)
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
|
||||
json={"gc_horizon": gc_horizon},
|
||||
)
|
||||
log.info(f"Got GC request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is not None
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
|
||||
)
|
||||
log.info(f"Got compact request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
|
||||
def timeline_get_lsn_by_timestamp(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
|
||||
):
|
||||
log.info(
|
||||
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
|
||||
)
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
|
||||
)
|
||||
log.info(f"Got checkpoint request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is None
|
||||
|
||||
def timeline_spawn_download_remote_layers(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
max_concurrent_downloads: int,
|
||||
) -> dict[str, Any]:
|
||||
body = {
|
||||
"max_concurrent_downloads": max_concurrent_downloads,
|
||||
}
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
|
||||
json=body,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is not None
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_poll_download_remote_layers_status(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
spawn_response: dict[str, Any],
|
||||
poll_state=None,
|
||||
) -> None | dict[str, Any]:
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
assert res_json is not None
|
||||
assert isinstance(res_json, dict)
|
||||
|
||||
# assumption in this API client here is that nobody else spawns the task
|
||||
assert res_json["task_id"] == spawn_response["task_id"]
|
||||
|
||||
if poll_state is None or res_json["state"] == poll_state:
|
||||
return res_json
|
||||
return None
|
||||
|
||||
def timeline_download_remote_layers(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
max_concurrent_downloads: int,
|
||||
errors_ok=False,
|
||||
at_least_one_download=True,
|
||||
):
|
||||
res = self.timeline_spawn_download_remote_layers(
|
||||
tenant_id, timeline_id, max_concurrent_downloads
|
||||
)
|
||||
while True:
|
||||
completed = self.timeline_poll_download_remote_layers_status(
|
||||
tenant_id, timeline_id, res, poll_state="Completed"
|
||||
)
|
||||
if not completed:
|
||||
time.sleep(0.1)
|
||||
continue
|
||||
if not errors_ok:
|
||||
assert completed["failed_download_count"] == 0
|
||||
if at_least_one_download:
|
||||
assert completed["successful_download_count"] > 0
|
||||
return completed
|
||||
|
||||
def get_metrics_str(self) -> str:
|
||||
"""You probably want to use get_metrics() instead."""
|
||||
res = self.get(f"http://localhost:{self.port}/metrics")
|
||||
self.verbose_error(res)
|
||||
return res.text
|
||||
|
||||
def get_metrics(self) -> Metrics:
|
||||
res = self.get_metrics_str()
|
||||
return parse_metrics(res)
|
||||
|
||||
def get_timeline_metric(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str
|
||||
) -> float:
|
||||
metrics = self.get_metrics()
|
||||
return metrics.query_one(
|
||||
metric_name,
|
||||
filter={
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
).value
|
||||
|
||||
def get_remote_timeline_client_metric(
|
||||
self,
|
||||
metric_name: str,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
file_kind: str,
|
||||
op_kind: str,
|
||||
) -> Optional[float]:
|
||||
metrics = self.get_metrics()
|
||||
matches = metrics.query_all(
|
||||
name=metric_name,
|
||||
filter={
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
"file_kind": str(file_kind),
|
||||
"op_kind": str(op_kind),
|
||||
},
|
||||
)
|
||||
if len(matches) == 0:
|
||||
value = None
|
||||
elif len(matches) == 1:
|
||||
value = matches[0].value
|
||||
assert value is not None
|
||||
else:
|
||||
assert len(matches) < 2, "above filter should uniquely identify metric"
|
||||
return value
|
||||
|
||||
def get_metric_value(
|
||||
self, name: str, filter: Optional[Dict[str, str]] = None
|
||||
) -> Optional[float]:
|
||||
metrics = self.get_metrics()
|
||||
results = metrics.query_all(name, filter=filter)
|
||||
if not results:
|
||||
log.info(f'could not find metric "{name}"')
|
||||
return None
|
||||
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
|
||||
return results[0].value
|
||||
|
||||
def layer_map_info(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> LayerMapInfo:
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return LayerMapInfo.from_json(res.json())
|
||||
|
||||
def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
assert res.status_code == 200
|
||||
|
||||
def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
|
||||
res = self.delete(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
assert res.status_code == 200
|
||||
|
||||
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
info = self.layer_map_info(tenant_id, timeline_id)
|
||||
for layer in info.historic_layers:
|
||||
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
def disk_usage_eviction_run(self, request: dict[str, Any]):
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/disk_usage_eviction/run",
|
||||
json=request,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def tenant_break(self, tenant_id: TenantId):
|
||||
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
|
||||
self.verbose_error(res)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TenantConfig:
|
||||
tenant_specific_overrides: Dict[str, Any]
|
||||
effective_config: Dict[str, Any]
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
|
||||
return TenantConfig(
|
||||
tenant_specific_overrides=d["tenant_specific_overrides"],
|
||||
effective_config=d["effective_config"],
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LayerMapInfo:
|
||||
in_memory_layers: List[InMemoryLayerInfo]
|
||||
historic_layers: List[HistoricLayerInfo]
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo:
|
||||
info = LayerMapInfo(in_memory_layers=[], historic_layers=[])
|
||||
|
||||
json_in_memory_layers = d["in_memory_layers"]
|
||||
assert isinstance(json_in_memory_layers, List)
|
||||
for json_in_memory_layer in json_in_memory_layers:
|
||||
info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer))
|
||||
|
||||
json_historic_layers = d["historic_layers"]
|
||||
assert isinstance(json_historic_layers, List)
|
||||
for json_historic_layer in json_historic_layers:
|
||||
info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer))
|
||||
|
||||
return info
|
||||
|
||||
def kind_count(self) -> Dict[str, int]:
|
||||
counts: Dict[str, int] = defaultdict(int)
|
||||
for inmem_layer in self.in_memory_layers:
|
||||
counts[inmem_layer.kind] += 1
|
||||
for hist_layer in self.historic_layers:
|
||||
counts[hist_layer.kind] += 1
|
||||
return counts
|
||||
|
||||
|
||||
@dataclass
|
||||
class InMemoryLayerInfo:
|
||||
kind: str
|
||||
lsn_start: str
|
||||
lsn_end: Optional[str]
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo:
|
||||
return InMemoryLayerInfo(
|
||||
kind=d["kind"],
|
||||
lsn_start=d["lsn_start"],
|
||||
lsn_end=d.get("lsn_end"),
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class HistoricLayerInfo:
|
||||
kind: str
|
||||
layer_file_name: str
|
||||
layer_file_size: Optional[int]
|
||||
lsn_start: str
|
||||
lsn_end: Optional[str]
|
||||
remote: bool
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
|
||||
return HistoricLayerInfo(
|
||||
kind=d["kind"],
|
||||
layer_file_name=d["layer_file_name"],
|
||||
layer_file_size=d.get("layer_file_size"),
|
||||
lsn_start=d["lsn_start"],
|
||||
lsn_end=d.get("lsn_end"),
|
||||
remote=d["remote"],
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PageserverPort:
|
||||
pg: int
|
||||
@@ -2436,10 +1905,16 @@ def remote_pg(
|
||||
connstr = os.getenv("BENCHMARK_CONNSTR")
|
||||
if connstr is None:
|
||||
raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")
|
||||
|
||||
start_ms = int(datetime.utcnow().timestamp() * 1000)
|
||||
with RemotePostgres(pg_bin, connstr) as remote_pg:
|
||||
yield remote_pg
|
||||
|
||||
end_ms = int(datetime.utcnow().timestamp() * 1000)
|
||||
host = parse_dsn(connstr).get("host", "")
|
||||
if host.endswith(".neon.build"):
|
||||
# Add 10s margin to the start and end times
|
||||
allure_add_grafana_links(host, start_ms - 10_000, end_ms + 10_000)
|
||||
|
||||
|
||||
class PSQL:
|
||||
"""
|
||||
@@ -3378,151 +2853,6 @@ def check_restored_datadir_content(
|
||||
assert (mismatch, error) == ([], [])
|
||||
|
||||
|
||||
def wait_until(number_of_iterations: int, interval: float, func):
|
||||
"""
|
||||
Wait until 'func' returns successfully, without exception. Returns the
|
||||
last return value from the function.
|
||||
"""
|
||||
last_exception = None
|
||||
for i in range(number_of_iterations):
|
||||
try:
|
||||
res = func()
|
||||
except Exception as e:
|
||||
log.info("waiting for %s iteration %s failed", func, i + 1)
|
||||
last_exception = e
|
||||
time.sleep(interval)
|
||||
continue
|
||||
return res
|
||||
raise Exception("timed out while waiting for %s" % func) from last_exception
|
||||
|
||||
|
||||
def wait_while(number_of_iterations: int, interval: float, func):
|
||||
"""
|
||||
Wait until 'func' returns false, or throws an exception.
|
||||
"""
|
||||
for i in range(number_of_iterations):
|
||||
try:
|
||||
if not func():
|
||||
return
|
||||
log.info("waiting for %s iteration %s failed", func, i + 1)
|
||||
time.sleep(interval)
|
||||
continue
|
||||
except Exception:
|
||||
return
|
||||
raise Exception("timed out while waiting for %s" % func)
|
||||
|
||||
|
||||
def assert_tenant_status(
|
||||
pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str
|
||||
):
|
||||
tenant_status = pageserver_http_client.tenant_status(tenant)
|
||||
log.info(f"tenant_status: {tenant_status}")
|
||||
assert tenant_status["state"] == expected_status, tenant_status
|
||||
|
||||
|
||||
def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId):
|
||||
tenants = ps_http.tenant_list()
|
||||
matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
|
||||
assert len(matching) < 2
|
||||
if len(matching) == 0:
|
||||
return None
|
||||
return matching[0]
|
||||
|
||||
|
||||
def remote_consistent_lsn(
|
||||
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
||||
) -> Lsn:
|
||||
detail = pageserver_http_client.timeline_detail(tenant, timeline)
|
||||
|
||||
if detail["remote_consistent_lsn"] is None:
|
||||
# No remote information at all. This happens right after creating
|
||||
# a timeline, before any part of it has been uploaded to remote
|
||||
# storage yet.
|
||||
return Lsn(0)
|
||||
else:
|
||||
lsn_str = detail["remote_consistent_lsn"]
|
||||
assert isinstance(lsn_str, str)
|
||||
return Lsn(lsn_str)
|
||||
|
||||
|
||||
def wait_for_upload(
|
||||
pageserver_http_client: PageserverHttpClient,
|
||||
tenant: TenantId,
|
||||
timeline: TimelineId,
|
||||
lsn: Lsn,
|
||||
):
|
||||
"""waits for local timeline upload up to specified lsn"""
|
||||
for i in range(20):
|
||||
current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
log.info("wait finished")
|
||||
return
|
||||
log.info(
|
||||
"waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn, current_lsn, i + 1
|
||||
)
|
||||
)
|
||||
time.sleep(1)
|
||||
raise Exception(
|
||||
"timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
|
||||
lsn, current_lsn
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# Does not use `wait_until` for debugging purposes
|
||||
def wait_until_tenant_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
expected_state: str,
|
||||
iterations: int,
|
||||
) -> bool:
|
||||
for _ in range(iterations):
|
||||
try:
|
||||
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
|
||||
log.debug(f"Tenant {tenant_id} data: {tenant}")
|
||||
if tenant["state"] == expected_state:
|
||||
return True
|
||||
except Exception as e:
|
||||
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
|
||||
|
||||
|
||||
def last_record_lsn(
|
||||
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
||||
) -> Lsn:
|
||||
detail = pageserver_http_client.timeline_detail(tenant, timeline)
|
||||
|
||||
lsn_str = detail["last_record_lsn"]
|
||||
assert isinstance(lsn_str, str)
|
||||
return Lsn(lsn_str)
|
||||
|
||||
|
||||
def wait_for_last_record_lsn(
|
||||
pageserver_http_client: PageserverHttpClient,
|
||||
tenant: TenantId,
|
||||
timeline: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Lsn:
|
||||
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
||||
for i in range(10):
|
||||
current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
return current_lsn
|
||||
log.info(
|
||||
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn, current_lsn, i + 1
|
||||
)
|
||||
)
|
||||
time.sleep(1)
|
||||
raise Exception(
|
||||
"timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
|
||||
)
|
||||
|
||||
|
||||
def wait_for_last_flush_lsn(
|
||||
env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId
|
||||
) -> Lsn:
|
||||
@@ -3584,23 +2914,3 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage(
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
|
||||
return lsn
|
||||
|
||||
|
||||
def wait_for_upload_queue_empty(
|
||||
pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
ps_http = pageserver.http_client()
|
||||
while True:
|
||||
all_metrics = ps_http.get_metrics()
|
||||
tl = all_metrics.query_all(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
{
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
)
|
||||
assert len(tl) > 0
|
||||
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
|
||||
if all(m.value == 0 for m in tl):
|
||||
return
|
||||
time.sleep(0.2)
|
||||
|
||||
0
test_runner/fixtures/pageserver/__init__.py
Normal file
0
test_runner/fixtures/pageserver/__init__.py
Normal file
545
test_runner/fixtures/pageserver/http.py
Normal file
545
test_runner/fixtures/pageserver/http.py
Normal file
@@ -0,0 +1,545 @@
from __future__ import annotations

import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import requests

from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import Fn


class PageserverApiException(Exception):
    def __init__(self, message, status_code: int):
        super().__init__(message)
        self.status_code = status_code


@dataclass
class InMemoryLayerInfo:
    kind: str
    lsn_start: str
    lsn_end: Optional[str]

    @classmethod
    def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo:
        return InMemoryLayerInfo(
            kind=d["kind"],
            lsn_start=d["lsn_start"],
            lsn_end=d.get("lsn_end"),
        )


@dataclass(frozen=True)
class HistoricLayerInfo:
    kind: str
    layer_file_name: str
    layer_file_size: Optional[int]
    lsn_start: str
    lsn_end: Optional[str]
    remote: bool

    @classmethod
    def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
        return HistoricLayerInfo(
            kind=d["kind"],
            layer_file_name=d["layer_file_name"],
            layer_file_size=d.get("layer_file_size"),
            lsn_start=d["lsn_start"],
            lsn_end=d.get("lsn_end"),
            remote=d["remote"],
        )


@dataclass
class LayerMapInfo:
    in_memory_layers: List[InMemoryLayerInfo]
    historic_layers: List[HistoricLayerInfo]

    @classmethod
    def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo:
        info = LayerMapInfo(in_memory_layers=[], historic_layers=[])

        json_in_memory_layers = d["in_memory_layers"]
        assert isinstance(json_in_memory_layers, List)
        for json_in_memory_layer in json_in_memory_layers:
            info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer))

        json_historic_layers = d["historic_layers"]
        assert isinstance(json_historic_layers, List)
        for json_historic_layer in json_historic_layers:
            info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer))

        return info

    def kind_count(self) -> Dict[str, int]:
        counts: Dict[str, int] = defaultdict(int)
        for inmem_layer in self.in_memory_layers:
            counts[inmem_layer.kind] += 1
        for hist_layer in self.historic_layers:
            counts[hist_layer.kind] += 1
        return counts
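
A quick sketch of how a test might consume this layer-map parsing; the payload below is invented for illustration and only mirrors the shape that LayerMapInfo.from_json expects, it is not a real pageserver response:

    # hypothetical payload shaped like the /layer/ endpoint output parsed above
    sample = {
        "in_memory_layers": [{"kind": "Open", "lsn_start": "0/169F0F8", "lsn_end": None}],
        "historic_layers": [],
    }
    info = LayerMapInfo.from_json(sample)
    assert info.kind_count() == {"Open": 1}
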
@dataclass
class TenantConfig:
    tenant_specific_overrides: Dict[str, Any]
    effective_config: Dict[str, Any]

    @classmethod
    def from_json(cls, d: Dict[str, Any]) -> TenantConfig:
        return TenantConfig(
            tenant_specific_overrides=d["tenant_specific_overrides"],
            effective_config=d["effective_config"],
        )


class PageserverHttpClient(requests.Session):
    def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
        super().__init__()
        self.port = port
        self.auth_token = auth_token
        self.is_testing_enabled_or_skip = is_testing_enabled_or_skip

        if auth_token is not None:
            self.headers["Authorization"] = f"Bearer {auth_token}"

    def verbose_error(self, res: requests.Response):
        try:
            res.raise_for_status()
        except requests.RequestException as e:
            try:
                msg = res.json()["msg"]
            except: # noqa: E722
                msg = ""
            raise PageserverApiException(msg, res.status_code) from e

    def check_status(self):
        self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()

    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
        self.is_testing_enabled_or_skip()

        if isinstance(config_strings, tuple):
            pairs = [config_strings]
        else:
            pairs = config_strings

        log.info(f"Requesting config failpoints: {repr(pairs)}")

        res = self.put(
            f"http://localhost:{self.port}/v1/failpoints",
            json=[{"name": name, "actions": actions} for name, actions in pairs],
        )
        log.info(f"Got failpoints request response code {res.status_code}")
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None
        return res_json

    def tenant_list(self) -> List[Dict[Any, Any]]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId:
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant",
            json={
                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
            },
        )
        self.verbose_error(res)
        if res.status_code == 409:
            raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
        new_tenant_id = res.json()
        assert isinstance(new_tenant_id, str)
        return TenantId(new_tenant_id)

    def tenant_attach(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach")
        self.verbose_error(res)

    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
        params = {}
        if detach_ignored:
            params["detach_ignored"] = "true"

        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

    def tenant_load(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
        self.verbose_error(res)

    def tenant_ignore(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
        self.verbose_error(res)

    def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def tenant_config(self, tenant_id: TenantId) -> TenantConfig:
        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config")
        self.verbose_error(res)
        return TenantConfig.from_json(res.json())

    def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
        assert "tenant_id" not in config.keys()
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/config",
            json={**config, "tenant_id": str(tenant_id)},
        )
        self.verbose_error(res)

    def patch_tenant_config_client_side(
        self,
        tenant_id: TenantId,
        inserts: Optional[Dict[str, Any]] = None,
        removes: Optional[List[str]] = None,
    ):
        current = self.tenant_config(tenant_id).tenant_specific_overrides
        if inserts is not None:
            current.update(inserts)
        if removes is not None:
            for key in removes:
                del current[key]
        self.set_tenant_config(tenant_id, current)

    def tenant_size(self, tenant_id: TenantId) -> int:
        return self.tenant_size_and_modelinputs(tenant_id)[0]

    def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
        """
        Returns the tenant size, together with the model inputs as the second tuple item.
        """
        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
        self.verbose_error(res)
        res = res.json()
        assert isinstance(res, dict)
        assert TenantId(res["id"]) == tenant_id
        size = res["size"]
        assert type(size) == int
        inputs = res["inputs"]
        assert type(inputs) is dict
        return (size, inputs)

    def tenant_size_debug(self, tenant_id: TenantId) -> str:
        """
        Returns the tenant size debug info, as an HTML string
        """
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
            headers={"Accept": "text/html"},
        )
        return res.text

    def timeline_list(
        self,
        tenant_id: TenantId,
        include_non_incremental_logical_size: bool = False,
        include_timeline_dir_layer_file_size_sum: bool = False,
    ) -> List[Dict[str, Any]]:
        params = {}
        if include_non_incremental_logical_size:
            params["include-non-incremental-logical-size"] = "true"
        if include_timeline_dir_layer_file_size_sum:
            params["include-timeline-dir-layer-file-size-sum"] = "true"

        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params
        )
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def timeline_create(
        self,
        tenant_id: TenantId,
        new_timeline_id: Optional[TimelineId] = None,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
    ) -> Dict[Any, Any]:
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline",
            json={
                "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
                "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
                "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
            },
        )
        self.verbose_error(res)
        if res.status_code == 409:
            raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")

        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def timeline_detail(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        include_non_incremental_logical_size: bool = False,
        include_timeline_dir_layer_file_size_sum: bool = False,
        **kwargs,
    ) -> Dict[Any, Any]:
        params = {}
        if include_non_incremental_logical_size:
            params["include-non-incremental-logical-size"] = "true"
        if include_timeline_dir_layer_file_size_sum:
            params["include-timeline-dir-layer-file-size-sum"] = "true"

        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            params=params,
            **kwargs,
        )
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
        res = self.delete(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
        )
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None

    def timeline_gc(
        self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
    ) -> dict[str, Any]:
        self.is_testing_enabled_or_skip()

        log.info(
            f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
        )
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc",
            json={"gc_horizon": gc_horizon},
        )
        log.info(f"Got GC request response code: {res.status_code}")
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is not None
        assert isinstance(res_json, dict)
        return res_json

    def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
        self.is_testing_enabled_or_skip()

        log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
        )
        log.info(f"Got compact request response code: {res.status_code}")
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None

    def timeline_get_lsn_by_timestamp(
        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
    ):
        log.info(
            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
        )
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
        )
        self.verbose_error(res)
        res_json = res.json()
        return res_json

    def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
        self.is_testing_enabled_or_skip()

        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
        )
        log.info(f"Got checkpoint request response code: {res.status_code}")
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is None

    def timeline_spawn_download_remote_layers(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        max_concurrent_downloads: int,
    ) -> dict[str, Any]:
        body = {
            "max_concurrent_downloads": max_concurrent_downloads,
        }
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
            json=body,
        )
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is not None
        assert isinstance(res_json, dict)
        return res_json

    def timeline_poll_download_remote_layers_status(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        spawn_response: dict[str, Any],
        poll_state=None,
    ) -> None | dict[str, Any]:
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
        )
        self.verbose_error(res)
        res_json = res.json()
        assert res_json is not None
        assert isinstance(res_json, dict)

        # assumption in this API client here is that nobody else spawns the task
        assert res_json["task_id"] == spawn_response["task_id"]

        if poll_state is None or res_json["state"] == poll_state:
            return res_json
        return None

    def timeline_download_remote_layers(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        max_concurrent_downloads: int,
        errors_ok=False,
        at_least_one_download=True,
    ):
        res = self.timeline_spawn_download_remote_layers(
            tenant_id, timeline_id, max_concurrent_downloads
        )
        while True:
            completed = self.timeline_poll_download_remote_layers_status(
                tenant_id, timeline_id, res, poll_state="Completed"
            )
            if not completed:
                time.sleep(0.1)
                continue
            if not errors_ok:
                assert completed["failed_download_count"] == 0
            if at_least_one_download:
                assert completed["successful_download_count"] > 0
            return completed

    def get_metrics_str(self) -> str:
        """You probably want to use get_metrics() instead."""
        res = self.get(f"http://localhost:{self.port}/metrics")
        self.verbose_error(res)
        return res.text

    def get_metrics(self) -> Metrics:
        res = self.get_metrics_str()
        return parse_metrics(res)

    def get_timeline_metric(
        self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str
    ) -> float:
        metrics = self.get_metrics()
        return metrics.query_one(
            metric_name,
            filter={
                "tenant_id": str(tenant_id),
                "timeline_id": str(timeline_id),
            },
        ).value

    def get_remote_timeline_client_metric(
        self,
        metric_name: str,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        file_kind: str,
        op_kind: str,
    ) -> Optional[float]:
        metrics = self.get_metrics()
        matches = metrics.query_all(
            name=metric_name,
            filter={
                "tenant_id": str(tenant_id),
                "timeline_id": str(timeline_id),
                "file_kind": str(file_kind),
                "op_kind": str(op_kind),
            },
        )
        if len(matches) == 0:
            value = None
        elif len(matches) == 1:
            value = matches[0].value
            assert value is not None
        else:
            assert len(matches) < 2, "above filter should uniquely identify metric"
        return value

    def get_metric_value(
        self, name: str, filter: Optional[Dict[str, str]] = None
    ) -> Optional[float]:
        metrics = self.get_metrics()
        results = metrics.query_all(name, filter=filter)
        if not results:
            log.info(f'could not find metric "{name}"')
            return None
        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
        return results[0].value

    def layer_map_info(
        self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> LayerMapInfo:
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/",
        )
        self.verbose_error(res)
        return LayerMapInfo.from_json(res.json())

    def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
        res = self.get(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
        )
        self.verbose_error(res)

        assert res.status_code == 200

    def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str):
        res = self.delete(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}",
        )
        self.verbose_error(res)

        assert res.status_code == 200

    def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
        info = self.layer_map_info(tenant_id, timeline_id)
        for layer in info.historic_layers:
            self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)

    def disk_usage_eviction_run(self, request: dict[str, Any]):
        res = self.put(
            f"http://localhost:{self.port}/v1/disk_usage_eviction/run",
            json=request,
        )
        self.verbose_error(res)
        return res.json()

    def tenant_break(self, tenant_id: TenantId):
        res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
        self.verbose_error(res)
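
As a rough illustration of how tests drive this client (the port number is a placeholder; in real tests the NeonEnv fixtures construct the client and supply the pageserver's actual HTTP port):

    # hypothetical wiring: talk to a locally running pageserver HTTP API
    client = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None)
    client.check_status()
    tenant_id = client.tenant_create()
    status = client.tenant_status(tenant_id)
    log.info(f"tenant {tenant_id} is in state {status['state']}")
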
145
test_runner/fixtures/pageserver/utils.py
Normal file
@@ -0,0 +1,145 @@
import time

from fixtures.log_helper import log
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import Lsn, TenantId, TimelineId


def assert_tenant_status(
    pageserver_http: PageserverHttpClient, tenant: TenantId, expected_status: str
):
    tenant_status = pageserver_http.tenant_status(tenant)
    log.info(f"tenant_status: {tenant_status}")
    assert tenant_status["state"] == expected_status, tenant_status


def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
    tenants = pageserver_http.tenant_list()
    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
    assert len(matching) < 2
    if len(matching) == 0:
        return None
    return matching[0]


def remote_consistent_lsn(
    pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
    detail = pageserver_http.timeline_detail(tenant, timeline)

    if detail["remote_consistent_lsn"] is None:
        # No remote information at all. This happens right after creating
        # a timeline, before any part of it has been uploaded to remote
        # storage yet.
        return Lsn(0)
    else:
        lsn_str = detail["remote_consistent_lsn"]
        assert isinstance(lsn_str, str)
        return Lsn(lsn_str)


def wait_for_upload(
    pageserver_http: PageserverHttpClient,
    tenant: TenantId,
    timeline: TimelineId,
    lsn: Lsn,
):
    """waits for local timeline upload up to specified lsn"""
    for i in range(20):
        current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
        if current_lsn >= lsn:
            log.info("wait finished")
            return
        log.info(
            "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format(
                lsn, current_lsn, i + 1
            )
        )
        time.sleep(1)
    raise Exception(
        "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format(
            lsn, current_lsn
        )
    )


def wait_until_tenant_state(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    expected_state: str,
    iterations: int,
) -> bool:
    """
    Does not use `wait_until` for debugging purposes
    """
    for _ in range(iterations):
        try:
            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
            log.debug(f"Tenant {tenant_id} data: {tenant}")
            if tenant["state"] == expected_state:
                return True
        except Exception as e:
            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")

        time.sleep(1)

    raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")


def wait_until_tenant_active(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int = 30
):
    wait_until_tenant_state(
        pageserver_http, tenant_id, expected_state="Active", iterations=iterations
    )


def last_record_lsn(
    pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
) -> Lsn:
    detail = pageserver_http_client.timeline_detail(tenant, timeline)

    lsn_str = detail["last_record_lsn"]
    assert isinstance(lsn_str, str)
    return Lsn(lsn_str)


def wait_for_last_record_lsn(
    pageserver_http: PageserverHttpClient,
    tenant: TenantId,
    timeline: TimelineId,
    lsn: Lsn,
) -> Lsn:
    """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
    for i in range(10):
        current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
        if current_lsn >= lsn:
            return current_lsn
        log.info(
            "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
                lsn, current_lsn, i + 1
            )
        )
        time.sleep(1)
    raise Exception(
        "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
    )


def wait_for_upload_queue_empty(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
    while True:
        all_metrics = pageserver_http.get_metrics()
        tl = all_metrics.query_all(
            "pageserver_remote_timeline_client_calls_unfinished",
            {
                "tenant_id": str(tenant_id),
                "timeline_id": str(timeline_id),
            },
        )
        assert len(tl) > 0
        log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
        if all(m.value == 0 for m in tl):
            return
        time.sleep(0.2)
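
A sketch of how a test typically chains these helpers after writing data; `env`, `pg`, `tenant_id`, `timeline_id` and `wait_for_last_flush_lsn` are assumed to come from the surrounding fixtures rather than being defined here:

    # hypothetical test tail: flush, then wait for pageserver ingest and remote upload to catch up
    lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
    ps_http = env.pageserver.http_client()
    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
    wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
    wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
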
@@ -1,4 +1,5 @@
import contextlib
import json
import os
import re
import subprocess
@@ -6,8 +7,9 @@ import tarfile
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, TypeVar
from urllib.parse import urlencode

import allure  # type: ignore
import allure
from psycopg2.extensions import cursor

from fixtures.log_helper import log
@@ -184,6 +186,46 @@ def allure_attach_from_dir(dir: Path):
        allure.attach.file(source, name, attachment_type, extension)


DATASOURCE_ID = "xHHYY0dVz"


def allure_add_grafana_links(host: str, start_ms: int, end_ms: int):
    """Add links to server logs in Grafana to Allure report"""
    # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build
    endpoint_id, region_id, _ = host.split(".", 2)

    expressions = {
        "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}',
        "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"',
        "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"',
        "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}',
    }

    params: Dict[str, Any] = {
        "datasource": DATASOURCE_ID,
        "queries": [
            {
                "expr": "<PUT AN EXPRESSION HERE>",
                "refId": "A",
                "datasource": {"type": "loki", "uid": DATASOURCE_ID},
                "editorMode": "code",
                "queryType": "range",
            }
        ],
        "range": {
            "from": str(start_ms),
            "to": str(end_ms),
        },
    }
    for name, expr in expressions.items():
        params["queries"][0]["expr"] = expr
        query_string = urlencode({"orgId": 1, "left": json.dumps(params)})
        link = f"https://neonprod.grafana.net/explore?{query_string}"

        allure.dynamic.link(link, name=name)
        log.info(f"{name}: {link}")
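
For a sense of what this helper emits, a call like the one below (timestamps are invented millisecond epochs) attaches one Grafana Explore link per expression above, all scoped to the given time range; the example host matches the format mentioned in the comment:

    # hypothetical invocation from a test teardown hook
    allure_add_grafana_links(
        host="ep-divine-night-159320.us-east-2.aws.neon.build",
        start_ms=1681200000000,
        end_ms=1681203600000,
    )
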

def start_in_background(
    command: list[str], cwd: Path, log_file_name: str, is_started: Fn
) -> subprocess.Popen[bytes]:
@@ -236,3 +278,19 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
            continue
        return res
    raise Exception("timed out while waiting for %s" % func) from last_exception


def wait_while(number_of_iterations: int, interval: float, func):
    """
    Wait until 'func' returns false, or throws an exception.
    """
    for i in range(number_of_iterations):
        try:
            if not func():
                return
            log.info("waiting for %s iteration %s failed", func, i + 1)
            time.sleep(interval)
            continue
        except Exception:
            return
    raise Exception("timed out while waiting for %s" % func)
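
A minimal sketch of how the two polling helpers complement each other; `client`, `tenant_id` and `dropped_tenant_id` are placeholders:

    # hypothetical checks: poll until the tenant is reachable, then until a detached tenant disappears
    wait_until(number_of_iterations=10, interval=1, func=lambda: client.tenant_status(tenant_id))
    wait_while(10, 1, lambda: tenant_exists(client, dropped_tenant_id))
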
@@ -10,7 +10,7 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import NeonCompare
from fixtures.log_helper import log
from fixtures.neon_fixtures import wait_for_last_record_lsn
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn


@@ -1,7 +1,8 @@
from contextlib import closing

import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PgProtocol
from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId


@@ -10,12 +10,11 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonCli,
    NeonEnvBuilder,
    PageserverHttpClient,
    PgBin,
    PortDistributor,
    wait_for_last_record_lsn,
    wait_for_upload,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn
from pytest import FixtureRequest

@@ -34,9 +33,15 @@ from pytest import FixtureRequest
# - check_neon_works performs the test itself, feel free to add more checks there.
#

check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
    os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None,
    reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set",
)


# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
@@ -81,6 +86,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o
    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it


@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_backward_compatibility(
@@ -134,6 +140,7 @@ def test_backward_compatibility(
    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"


@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_forward_compatibility(

@@ -11,14 +11,14 @@ from fixtures.neon_fixtures import (
    LocalFsStorage,
    NeonEnv,
    NeonEnvBuilder,
    PageserverHttpClient,
    PgBin,
    RemoteStorageKind,
    wait_for_last_flush_lsn,
    wait_for_upload_queue_empty,
    wait_until,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until

GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"

@@ -138,7 +138,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
    # remove the initial tenant
    ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865
    assert env.initial_timeline
    wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline)
    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline)
    pageserver_http.tenant_detach(env.initial_tenant)
    assert isinstance(env.remote_storage, LocalFsStorage)
    tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
@@ -182,7 +182,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
    # after stopping the safekeepers, we know that no new WAL will be coming in
    for tenant_id, timeline_id in timelines:
        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
        wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id)
        wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
        tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
        assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
        assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]

@@ -13,9 +13,8 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    Postgres,
    wait_for_last_record_lsn,
    wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import subprocess_capture


@@ -6,10 +6,9 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    RemoteStorageKind,
    wait_for_last_flush_lsn,
    wait_for_last_record_lsn,
    wait_for_sk_commit_lsn_to_reach_remote_storage,
    wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar


@@ -5,8 +5,8 @@ from fixtures.neon_fixtures import (
    DEFAULT_BRANCH_NAME,
    NeonEnv,
    NeonEnvBuilder,
    PageserverHttpClient,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import TenantId, TimelineId


@@ -1,6 +1,7 @@
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PageserverHttpClient
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.pageserver.http import PageserverHttpClient


def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):

@@ -10,20 +10,20 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PageserverApiException,
    PageserverHttpClient,
    RemoteStorageKind,
    assert_tenant_status,
    available_remote_storages,
    wait_for_last_flush_lsn,
    wait_for_last_record_lsn,
    wait_for_sk_commit_lsn_to_reach_remote_storage,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
    assert_tenant_status,
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_until,
    wait_until_tenant_state,
)
from fixtures.types import Lsn
from fixtures.utils import query_scalar
from fixtures.utils import query_scalar, wait_until


def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):

@@ -6,8 +6,8 @@ from fixtures.neon_fixtures import (
    DEFAULT_BRANCH_NAME,
    NeonEnv,
    NeonEnvBuilder,
    PageserverHttpClient,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until


@@ -1,6 +1,7 @@
from contextlib import closing

from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar


@@ -1,6 +1,7 @@
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn
from fixtures.utils import query_scalar


@@ -13,13 +13,15 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    LocalFsStorage,
    NeonEnvBuilder,
    PageserverApiException,
    PageserverHttpClient,
    RemoteStorageKind,
    available_remote_storages,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_until_tenant_active,
    wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
@@ -172,15 +174,10 @@ def test_remote_storage_backup_and_restore(
    client.tenant_attach(tenant_id)
    log.info("waiting for tenant to become active. this should be quick with on-demand download")

    def tenant_active():
        all_states = client.tenant_list()
        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
        assert tenant["state"] == "Active"

    wait_until(
        number_of_iterations=5,
        interval=1,
        func=tenant_active,
    wait_until_tenant_active(
        pageserver_http=client,
        tenant_id=tenant_id,
        iterations=5,
    )

    detail = client.timeline_detail(tenant_id, timeline_id)
@@ -357,12 +354,7 @@ def test_remote_storage_upload_queue_retries(

    client.tenant_attach(tenant_id)

    def tenant_active():
        all_states = client.tenant_list()
        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
        assert tenant["state"] == "Active"

    wait_until(30, 1, tenant_active)
    wait_until_tenant_active(client, tenant_id)

    log.info("restarting postgres to validate")
    pg = env.postgres.create_start("main", tenant_id=tenant_id)
@@ -497,12 +489,7 @@ def test_remote_timeline_client_calls_started_metric(

    client.tenant_attach(tenant_id)

    def tenant_active():
        all_states = client.tenant_list()
        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
        assert tenant["state"] == "Active"

    wait_until(30, 1, tenant_active)
    wait_until_tenant_active(client, tenant_id)

    log.info("restarting postgres to validate")
    pg = env.postgres.create_start("main", tenant_id=tenant_id)

@@ -6,9 +6,8 @@ from fixtures.neon_fixtures import (
    LocalFsStorage,
    NeonEnvBuilder,
    RemoteStorageKind,
    assert_tenant_status,
    wait_for_upload,
)
from fixtures.pageserver.utils import assert_tenant_status, wait_for_upload
from fixtures.types import Lsn
from fixtures.utils import wait_until


@@ -9,18 +9,18 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PageserverApiException,
    PageserverHttpClient,
    Postgres,
    RemoteStorageKind,
    available_remote_storages,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
    wait_for_upload,
    wait_until,
    wait_until_tenant_state,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar
from fixtures.utils import query_scalar, wait_until


def do_gc_target(

@@ -10,18 +10,24 @@ from fixtures.neon_fixtures import (
    NeonBroker,
    NeonEnv,
    NeonEnvBuilder,
    PageserverHttpClient,
    PortDistributor,
    Postgres,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
    assert_tenant_status,
    tenant_exists,
    wait_for_last_record_lsn,
    wait_for_upload,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
    query_scalar,
    start_in_background,
    subprocess_capture,
    wait_until,
    wait_while,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, start_in_background, subprocess_capture


def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):

@@ -6,11 +6,11 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PageserverHttpClient,
    Postgres,
    wait_for_last_flush_lsn,
    wait_for_wal_insert_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.types import Lsn, TenantId, TimelineId


@@ -20,10 +20,12 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    Postgres,
    RemoteStorageKind,
    assert_tenant_status,
    available_remote_storages,
    wait_for_last_record_lsn,
    wait_for_sk_commit_lsn_to_reach_remote_storage,
)
from fixtures.pageserver.utils import (
    assert_tenant_status,
    wait_for_last_record_lsn,
    wait_for_upload,
)
from fixtures.types import Lsn, TenantId, TimelineId

@@ -1,5 +1,6 @@
import pytest
from fixtures.neon_fixtures import NeonEnv, PageserverApiException
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until


@@ -14,20 +14,21 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PageserverApiException,
    PageserverHttpClient,
    PgBin,
    PortDistributor,
    Postgres,
    RemoteStorageKind,
    VanillaPostgres,
    assert_tenant_status,
    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
    assert_tenant_status,
    wait_for_upload_queue_empty,
    wait_until,
    wait_until_tenant_active,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import get_timeline_dir_size
from fixtures.utils import get_timeline_dir_size, wait_until


def test_timeline_size(neon_simple_env: NeonEnv):
@@ -246,12 +247,7 @@ def test_timeline_initial_logical_size_calculation_cancellation(
        extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
    )

    def tenant_active():
        all_states = client.tenant_list()
        [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id]
        assert tenant["state"] == "Active"

    wait_until(30, 1, tenant_active)
    wait_until_tenant_active(client, tenant_id)

    # kick off initial size calculation task (the response we get here is the estimated size)
    def assert_size_calculation_not_done():
@@ -425,7 +421,7 @@ def test_timeline_physical_size_post_compaction(
    pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)

    if remote_storage_kind is not None:
        wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)
        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)

    assert_physical_size_invariants(
        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
@@ -478,7 +474,7 @@ def test_timeline_physical_size_post_gc(
    pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)

    if remote_storage_kind is not None:
        wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)
        wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id)

    assert_physical_size_invariants(
        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
@@ -584,7 +580,7 @@ def test_tenant_physical_size(

    tenant, timeline = env.neon_cli.create_tenant()
    if remote_storage_kind is not None:
        wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
        wait_for_upload_queue_empty(pageserver_http, tenant, timeline)

    def get_timeline_resident_physical_size(timeline: TimelineId):
        sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
@@ -609,7 +605,7 @@ def test_tenant_physical_size(
        pageserver_http.timeline_checkpoint(tenant, timeline)

        if remote_storage_kind is not None:
            wait_for_upload_queue_empty(env.pageserver, tenant, timeline)
            wait_for_upload_queue_empty(pageserver_http, tenant, timeline)

        timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)


@@ -30,9 +30,8 @@ from fixtures.neon_fixtures import (
    SafekeeperHttpClient,
    SafekeeperPort,
    available_remote_storages,
    wait_for_last_record_lsn,
    wait_for_upload,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import get_dir_size, query_scalar, start_in_background


@@ -3,7 +3,8 @@ import time
import psutil
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import TenantId
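
The bulk of the test-file changes above follow one mechanical pattern: helpers that used to be re-exported from fixtures.neon_fixtures are now imported from the new fixtures.pageserver package. A before/after sketch for a hypothetical test module:

    # before
    from fixtures.neon_fixtures import NeonEnvBuilder, PageserverHttpClient, wait_for_upload

    # after
    from fixtures.neon_fixtures import NeonEnvBuilder
    from fixtures.pageserver.http import PageserverHttpClient
    from fixtures.pageserver.utils import wait_for_upload
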
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 757df1dab8...3e70693c91
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: f8a650e49b...4ad87b0f36